Diffstat (limited to 'kernel')
201 files changed, 10082 insertions, 4808 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0ee9afd8b7cf..a89602a2a0ab 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,7 +12,6 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o -obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_VHOST_TASK) += vhost_task.o diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3a335c50e6e3..269c04a24664 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 0d56cea71602..5b37753799d2 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -550,6 +550,34 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) } } +/* + * Reserve an arena virtual address range without populating it. This call stops + * bpf_arena_alloc_pages from adding pages to this range. + */ +static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) +{ + long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + long pgoff; + int ret; + + if (uaddr & ~PAGE_MASK) + return 0; + + pgoff = compute_pgoff(arena, uaddr); + if (pgoff + page_cnt > page_cnt_max) + return -EINVAL; + + guard(mutex)(&arena->lock); + + /* Cannot guard already allocated pages. */ + ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); + if (ret) + return -EBUSY; + + /* "Allocate" the region to prevent it from being allocated. 
*/ + return range_tree_clear(&arena->rt, pgoff, page_cnt); +} + __bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, @@ -573,11 +601,26 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt return; arena_free_pages(arena, (long)ptr__ign, page_cnt); } + +__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA) + return -EINVAL; + + if (!page_cnt) + return 0; + + return arena_reserve_pages(arena, (long)ptr__ign, page_cnt); +} __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index eb28c0f219ee..3d080916faf9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -530,8 +530,6 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || @@ -544,14 +542,11 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes sure + /* + * Bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. 
*/ - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 380e9a7cac75..0cbcae727079 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -38,8 +38,7 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, - const struct bpf_iter_seq_info *seq_info); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -257,7 +256,7 @@ static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link, __get_seq_info(link)); + return prepare_seq_file(file, link); } static int iter_release(struct inode *inode, struct file *file) @@ -553,7 +552,8 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, if (!link) return -ENOMEM; - bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog, + attr->link_create.attach_type); link->tinfo = tinfo; err = bpf_link_prime(&link->link, &link_primer); @@ -586,9 +586,9 @@ static void init_seq_meta(struct bpf_iter_priv_data *priv_data, priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, - const struct bpf_iter_seq_info *seq_info) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) { + const struct bpf_iter_seq_info *seq_info = __get_seq_info(link); struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; struct bpf_prog *prog; @@ -653,7 +653,7 @@ int bpf_iter_new_fd(struct bpf_link *link) } iter_link = container_of(link, struct bpf_iter_link, link); - err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); + err = prepare_seq_file(file, iter_link); if (err) goto free_file; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index fa56c30833ff..b931fbceb54d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -722,13 +722,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 96113633e391..687a3e9c76f5 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -808,7 +808,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog); + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); *plink++ = &link->link; ksym = kzalloc(sizeof(*ksym), GFP_USER); @@ -1351,7 +1351,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) err = -ENOMEM; goto err_out; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL, + attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) 
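For context, a minimal BPF-side sketch (illustrative only, not part of the patch) of how the bpf_arena_reserve_pages() kfunc added in kernel/bpf/arena.c above might be used. The arena map definition, PAGE_SZ constant and program section are assumptions, and the selftests' arena address-space annotations are omitted for brevity:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

#define PAGE_SZ		4096	/* assumption: 4K pages */
#define NUMA_NO_NODE	(-1)

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 4);		/* arena size in pages */
} arena SEC(".maps");

void *bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
			    int node_id, __u64 flags) __ksym;
int bpf_arena_reserve_pages(void *map, void *addr, __u32 page_cnt) __ksym;

SEC("syscall")
int reserve_guard_page(void *ctx)
{
	char *page;

	/* Populate one page anywhere in the arena. */
	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
	if (!page)
		return 1;

	/* Reserve the following page: bpf_arena_alloc_pages() can no longer
	 * place pages in that range, and it stays unpopulated.
	 */
	return bpf_arena_reserve_pages(&arena, page + PAGE_SZ, 1);
}

char _license[] SEC("license") = "GPL";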
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1d2cf898e21e..64739308902f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -858,26 +858,43 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) EXPORT_SYMBOL_GPL(btf_type_by_id); /* - * Regular int is not a bit field and it must be either - * u8/u16/u32/u64 or __int128. + * Check that the type @t is a regular int. This means that @t is not + * a bit field and it has the same size as either of u8/u16/u32/u64 + * or __int128. If @expected_size is not zero, then size of @t should + * be the same. A caller should already have checked that the type @t + * is an integer. */ +static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size) +{ + u32 int_data = btf_type_int(t); + u8 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + + return BITS_PER_BYTE_MASKED(nr_bits) == 0 && + BTF_INT_OFFSET(int_data) == 0 && + (nr_bytes <= 16 && is_power_of_2(nr_bytes)) && + (expected_size == 0 || nr_bytes == expected_size); +} + static bool btf_type_int_is_regular(const struct btf_type *t) { - u8 nr_bits, nr_bytes; - u32 int_data; + return __btf_type_int_is_regular(t, 0); +} - int_data = btf_type_int(t); - nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && - nr_bytes != (2 * sizeof(u64)))) { - return false; - } +bool btf_type_is_i32(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4); +} - return true; +bool btf_type_is_i64(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); +} + +bool btf_type_is_primitive(const struct btf_type *t) +{ + return (btf_type_is_int(t) && btf_type_int_is_regular(t)) || + btf_is_any_enum(t); } /* @@ -3443,7 +3460,8 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, node_field_name = strstr(value_type, ":"); if (!node_field_name) return -EINVAL; - value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN); + value_type = kstrndup(value_type, node_field_name - value_type, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); @@ -3958,7 +3976,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. 
*/ - rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); @@ -6182,8 +6200,7 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty return kctx_type_id; } -BTF_ID_LIST(bpf_ctx_convert_btf_id) -BTF_ID(struct, bpf_ctx_convert) +BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert) static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name, void *data, unsigned int data_size) @@ -6903,6 +6920,7 @@ enum bpf_struct_walk_result { /* < 0 error */ WALK_SCALAR = 0, WALK_PTR, + WALK_PTR_UNTRUSTED, WALK_STRUCT, }; @@ -7144,6 +7162,8 @@ error: *field_name = mname; return WALK_PTR; } + + return WALK_PTR_UNTRUSTED; } /* Allow more flexible access within an int as long as @@ -7216,6 +7236,9 @@ int btf_struct_access(struct bpf_verifier_log *log, *next_btf_id = id; *flag = tmp_flag; return PTR_TO_BTF_ID; + case WALK_PTR_UNTRUSTED: + *flag = MEM_RDONLY | PTR_UNTRUSTED; + return PTR_TO_MEM; case WALK_SCALAR: return SCALAR_VALUE; case WALK_STRUCT: @@ -7628,11 +7651,12 @@ cand_cache_unlock: } enum btf_arg_tag { - ARG_TAG_CTX = BIT_ULL(0), - ARG_TAG_NONNULL = BIT_ULL(1), - ARG_TAG_TRUSTED = BIT_ULL(2), - ARG_TAG_NULLABLE = BIT_ULL(3), - ARG_TAG_ARENA = BIT_ULL(4), + ARG_TAG_CTX = BIT_ULL(0), + ARG_TAG_NONNULL = BIT_ULL(1), + ARG_TAG_TRUSTED = BIT_ULL(2), + ARG_TAG_UNTRUSTED = BIT_ULL(3), + ARG_TAG_NULLABLE = BIT_ULL(4), + ARG_TAG_ARENA = BIT_ULL(5), }; /* Process BTF of a function to produce high-level expectation of function @@ -7740,6 +7764,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tags |= ARG_TAG_CTX; } else if (strcmp(tag, "trusted") == 0) { tags |= ARG_TAG_TRUSTED; + } else if (strcmp(tag, "untrusted") == 0) { + tags |= ARG_TAG_UNTRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { @@ -7800,6 +7826,38 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].btf_id = kern_type_id; continue; } + if (tags & ARG_TAG_UNTRUSTED) { + struct btf *vmlinux_btf; + int kern_type_id; + + if (tags & ~ARG_TAG_UNTRUSTED) { + bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) { + sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + sub->args[i].mem_size = 0; + continue; + } + + kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); + if (kern_type_id < 0) + return kern_type_id; + + vmlinux_btf = bpf_get_btf_vmlinux(); + ref_t = btf_type_by_id(vmlinux_btf, kern_type_id); + if (!btf_type_is_struct(ref_t)) { + tname = __btf_name_by_offset(vmlinux_btf, t->name_off); + bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n", + i, btf_type_str(ref_t), tname); + return -EINVAL; + } + sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED; + sub->args[i].btf_id = kern_type_id; + continue; + } if (tags & ARG_TAG_ARENA) { if (tags & ~ARG_TAG_ARENA) { bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); @@ -8183,7 +8241,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, attr->attr.mode = 0444; attr->size = btf->data_size; attr->private = btf->data; - attr->read_new = sysfs_bin_attr_simple_read; + attr->read = sysfs_bin_attr_simple_read; err = 
sysfs_create_bin_file(btf_kobj, attr); if (err) { @@ -9019,7 +9077,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, bpf_free_cands_from_cache(*cc); *cc = NULL; } - new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); + new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9027,7 +9085,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, /* strdup the name, since it will stay in cache. * the cands->name points to strings in prog's BTF and the prog can be unloaded. */ - new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT); bpf_free_cands(cands); if (!new_cands->name) { kfree(new_cands); @@ -9111,7 +9169,7 @@ bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, continue; /* most of the time there is only one candidate for a given kind+name pair */ - new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9228,7 +9286,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" * into arrays of btf_ids of struct fields and array indices. */ - specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL_ACCOUNT); if (!specs) return -ENOMEM; @@ -9253,7 +9311,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, goto out; } if (cc->cnt) { - cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL_ACCOUNT); if (!cands.cands) { err = -ENOMEM; goto out; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f4885514f007..180b630279b9 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -658,6 +658,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, return NULL; } +static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) +{ + struct bpf_link *link = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + link = bpf_link_by_id(id_or_fd); + else if (id_or_fd) + link = bpf_link_get_from_fd(id_or_fd); + return link; +} + +static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd) +{ + struct bpf_prog *prog = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + prog = bpf_prog_by_id(id_or_fd); + else if (id_or_fd) + prog = bpf_prog_get(id_or_fd); + return prog; +} + +static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog, + struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd) +{ + bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID; + struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL); + bool preorder = flags & BPF_F_PREORDER; + struct bpf_link *anchor_link = NULL; + struct bpf_prog *anchor_prog = NULL; + bool is_before, is_after; + + is_before = flags & BPF_F_BEFORE; + is_after = flags & BPF_F_AFTER; + if (is_link || is_id || id_or_fd) { + /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */ + if (is_before == is_after) + return ERR_PTR(-EINVAL); + if ((is_link && !link) || (!is_link && !prog)) + return ERR_PTR(-EINVAL); + } else if (!hlist_empty(progs)) { + /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */ + if (is_before && 
is_after) + return ERR_PTR(-EINVAL); + } + + if (is_link) { + anchor_link = bpf_get_anchor_link(flags, id_or_fd); + if (IS_ERR(anchor_link)) + return ERR_CAST(anchor_link); + } else if (is_id || id_or_fd) { + anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); + if (IS_ERR(anchor_prog)) + return ERR_CAST(anchor_prog); + } + + if (!anchor_prog && !anchor_link) { + /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER + * doesn't matter since either prepend or append to a combined + * list of progs will end up with correct result. + */ + hlist_for_each_entry(pltmp, progs, node) { + if (is_before) + return pltmp; + if (pltmp->node.next) + continue; + return pltmp; + } + return NULL; + } + + hlist_for_each_entry(pltmp, progs, node) { + if ((anchor_prog && anchor_prog == pltmp->prog) || + (anchor_link && anchor_link == &pltmp->link->link)) { + if (!!(pltmp->flags & BPF_F_PREORDER) != preorder) + goto out; + pl = pltmp; + goto out; + } + } + + pl = ERR_PTR(-ENOENT); +out: + if (anchor_link) + bpf_link_put(anchor_link); + else + bpf_prog_put(anchor_prog); + return pl; +} + +static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs, + struct bpf_prog *prog, struct bpf_cgroup_link *link, + u32 flags, u32 id_or_fd) +{ + struct bpf_prog_list *pltmp; + + pltmp = get_prog_list(progs, prog, link, flags, id_or_fd); + if (IS_ERR(pltmp)) + return PTR_ERR(pltmp); + + if (!pltmp) + hlist_add_head(&pl->node, progs); + else if (flags & BPF_F_BEFORE) + hlist_add_before(&pl->node, &pltmp->node); + else + hlist_add_behind(&pl->node, &pltmp->node); + + return 0; +} + /** * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants @@ -667,6 +777,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags + * @id_or_fd: Relative prog id or fd + * @revision: bpf_prog_list revision * * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. 
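A userspace sketch (illustrative only, not part of the patch) of the attach semantics this enables: insert a cgroup program directly before an existing anchor program via the relative_fd and expected_revision attr fields, failing with -ESTALE if the prog list changed underneath. The file-descriptor variables and the attach type are assumptions:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int cgroup_attach_before(int cgroup_fd, int prog_fd,
				int anchor_prog_fd, __u64 revision)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type = BPF_CGROUP_INET_EGRESS;	/* example attach type */
	attr.attach_flags = BPF_F_ALLOW_MULTI | BPF_F_BEFORE;
	attr.relative_fd = anchor_prog_fd;	/* anchor: insert before this prog */
	attr.expected_revision = revision;	/* 0 = don't check, else -ESTALE on mismatch */

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}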
@@ -674,7 +786,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type, u32 flags, u32 id_or_fd, + u64 revision) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct bpf_prog *old_prog = NULL; @@ -690,6 +803,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER))) + /* only either replace or insertion with before/after */ + return -EINVAL; if (link && (prog || replace_prog)) /* only either link or prog/replace_prog can be specified */ return -EINVAL; @@ -700,6 +816,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; progs = &cgrp->bpf.progs[atype]; @@ -728,22 +846,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { - struct hlist_node *last = NULL; - pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - if (hlist_empty(progs)) - hlist_add_head(&pl->node, progs); - else - hlist_for_each(last, progs) { - if (last->next) - continue; - hlist_add_behind(&pl->node, last); - break; - } + + err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd); + if (err) { + kfree(pl); + bpf_cgroup_storages_free(new_storage); + return err; + } } pl->prog = prog; @@ -753,7 +867,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, cgrp->bpf.flags[atype] = saved_flags; if (type == BPF_LSM_CGROUP) { - err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type); if (err) goto cleanup; } @@ -762,6 +876,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup_trampoline; + cgrp->bpf.revisions[atype] += 1; if (old_prog) { if (type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(old_prog); @@ -793,12 +908,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, enum bpf_attach_type type, - u32 flags) + u32 flags, u32 id_or_fd, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags, + id_or_fd, revision); cgroup_unlock(); return ret; } @@ -868,7 +984,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); + atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -886,6 +1002,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (!found) return -ENOENT; + cgrp->bpf.revisions[atype] += 1; old_prog = xchg(&link->link.prog, new_prog); replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); @@ -1011,12 +1128,14 @@ found: * @prog: A program to detach or NULL * @link: A link to detach or NULL * @type: Type of detach operation + * @revision: bpf_prog_list revision * * At most one of @prog or @link can be non-NULL. 
* Must be called with cgroup_mutex held. */ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_cgroup_link *link, enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type, + u64 revision) { enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; @@ -1034,6 +1153,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; + progs = &cgrp->bpf.progs[atype]; flags = cgrp->bpf.flags[atype]; @@ -1059,6 +1181,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ hlist_del(&pl->node); + cgrp->bpf.revisions[atype] += 1; kfree(pl); if (hlist_empty(progs)) @@ -1074,12 +1197,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + enum bpf_attach_type type, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision); cgroup_unlock(); return ret; } @@ -1097,6 +1220,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct bpf_prog_array *effective; int cnt, ret = 0, i; int total_cnt = 0; + u64 revision = 0; u32 flags; if (effective_query && prog_attach_flags) @@ -1134,6 +1258,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; + if (!effective_query && from_atype == to_atype) + revision = cgrp->bpf.revisions[from_atype]; + if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; @@ -1216,7 +1344,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, - attr->attach_type, attr->attach_flags); + attr->attach_type, attr->attach_flags, + attr->relative_fd, attr->expected_revision); if (replace_prog) bpf_prog_put(replace_prog); @@ -1238,7 +1367,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision); if (prog) bpf_prog_put(prog); @@ -1267,8 +1396,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) } WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, - cg_link->type)); - if (cg_link->type == BPF_LSM_CGROUP) + link->attach_type, 0)); + if (link->attach_type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; @@ -1310,7 +1439,7 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, "cgroup_id:\t%llu\n" "attach_type:\t%d\n", cg_id, - cg_link->type); + link->attach_type); } static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, @@ -1326,7 +1455,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, cgroup_unlock(); info->cgroup.cgroup_id = cg_id; - info->cgroup.attach_type = cg_link->type; + info->cgroup.attach_type = link->attach_type; return 0; } @@ -1339,6 +1468,13 @@ static const struct bpf_link_ops 
bpf_cgroup_link_lops = { .fill_link_info = bpf_cgroup_link_fill_link_info, }; +#define BPF_F_LINK_ATTACH_MASK \ + (BPF_F_ID | \ + BPF_F_BEFORE | \ + BPF_F_AFTER | \ + BPF_F_PREORDER | \ + BPF_F_LINK) + int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; @@ -1346,7 +1482,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) struct cgroup *cgrp; int err; - if (attr->link_create.flags) + if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK)) return -EINVAL; cgrp = cgroup_get_from_fd(attr->link_create.target_fd); @@ -1359,9 +1495,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_cgroup; } bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, - prog); + prog, attr->link_create.attach_type); link->cgroup = cgrp; - link->type = attr->link_create.attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -1370,7 +1505,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, - link->type, BPF_F_ALLOW_MULTI); + link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags, + attr->link_create.cgroup.relative_fd, + attr->link_create.cgroup.expected_revision); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -2440,22 +2577,22 @@ static bool cg_sockopt_is_valid_access(int off, int size, } switch (off) { - case offsetof(struct bpf_sockopt, sk): + case bpf_ctx_range_ptr(struct bpf_sockopt, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; - case offsetof(struct bpf_sockopt, optval): + case bpf_ctx_range_ptr(struct bpf_sockopt, optval): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct bpf_sockopt, optval_end): + case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; - case offsetof(struct bpf_sockopt, retval): + case bpf_ctx_range(struct bpf_sockopt, retval): if (size != size_default) return false; return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dae281a1286d..09dde5b00d0c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); +#ifdef CONFIG_BPF_SYSCALL + bpf_prog_stream_init(fp); +#endif + return fp; } @@ -778,7 +782,10 @@ bool is_bpf_text_address(unsigned long addr) struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { - struct bpf_ksym *ksym = bpf_ksym_find(addr); + struct bpf_ksym *ksym; + + WARN_ON_ONCE(!rcu_read_lock_held()); + ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? container_of(ksym, struct bpf_prog_aux, ksym)->prog : @@ -1290,6 +1297,13 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, return 0; } +const char *bpf_jit_get_prog_name(struct bpf_prog *prog) +{ + if (prog->aux->ksym.prog) + return prog->aux->ksym.name; + return prog->aux->name; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, @@ -2102,14 +2116,15 @@ out: #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: - /* Speculation barrier for mitigating Speculative Store Bypass. 
- * In case of arm64, we rely on the firmware mitigation as - * controlled via the ssbd kernel parameter. Whenever the - * mitigation is enabled, it works for all of the kernel code - * with no need to provide any additional instructions here. - * In case of x86, we use 'lfence' insn for mitigation. We - * reuse preexisting logic from Spectre v1 mitigation that - * happens to produce the required code on x86 for v4 as well. + /* Speculation barrier for mitigating Speculative Store Bypass, + * Bounds-Check Bypass and Type Confusion. In case of arm64, we + * rely on the firmware mitigation as controlled via the ssbd + * kernel parameter. Whenever the mitigation is enabled, it + * works for all of the kernel code with no need to provide any + * additional instructions here. In case of x86, we use 'lfence' + * insn for mitigation. We reuse preexisting logic from Spectre + * v1 mitigation that happens to produce the required code on + * x86 for v4 as well. */ barrier_nospec(); CONT; @@ -2861,6 +2876,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); + bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) @@ -3034,6 +3050,21 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* By default, enable the verifier's mitigations against Spectre v1 and v4 for + * all archs. The value returned must not change at runtime as there is + * currently no support for reloading programs that were loaded without + * mitigations. + */ +bool __weak bpf_jit_bypass_spec_v1(void) +{ + return false; +} + +bool __weak bpf_jit_bypass_spec_v4(void) +{ + return false; +} + /* Return true if the JIT inlines the call to the helper corresponding to * the imm. * @@ -3144,6 +3175,22 @@ u64 __weak arch_bpf_timed_may_goto(void) return 0; } +static noinline void bpf_prog_report_may_goto_violation(void) +{ +#ifdef CONFIG_BPF_SYSCALL + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n"); + bpf_stream_dump_stack(ss); + })); +#endif +} + u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) { u64 time = ktime_get_mono_fast_ns(); @@ -3154,8 +3201,10 @@ u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) return BPF_MAX_TIMED_LOOPS; } /* Check if we've exhausted our time slice, and zero count. */ - if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) { + bpf_prog_report_may_goto_violation(); return 0; + } /* Refresh the count for the stack frame. */ return BPF_MAX_TIMED_LOOPS; } @@ -3192,3 +3241,85 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); + +#ifdef CONFIG_BPF_SYSCALL + +int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, + const char **linep, int *nump) +{ + int idx = -1, insn_start, insn_end, len; + struct bpf_line_info *linfo; + void **jited_linfo; + struct btf *btf; + int nr_linfo; + + btf = prog->aux->btf; + linfo = prog->aux->linfo; + jited_linfo = prog->aux->jited_linfo; + + if (!btf || !linfo || !jited_linfo) + return -EINVAL; + len = prog->aux->func ? 
prog->aux->func[prog->aux->func_idx]->len : prog->len; + + linfo = &prog->aux->linfo[prog->aux->linfo_idx]; + jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; + + insn_start = linfo[0].insn_off; + insn_end = insn_start + len; + nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx; + + for (int i = 0; i < nr_linfo && + linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { + if (jited_linfo[i] >= (void *)ip) + break; + idx = i; + } + + if (idx == -1) + return -ENOENT; + + /* Get base component of the file path. */ + *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); + *filep = kbasename(*filep); + /* Obtain the source line, and strip whitespace in prefix. */ + *linep = btf_name_by_offset(btf, linfo[idx].line_off); + while (isspace(**linep)) + *linep += 1; + *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); + return 0; +} + +struct walk_stack_ctx { + struct bpf_prog *prog; +}; + +static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct walk_stack_ctx *ctxp = cookie; + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return true; + if (bpf_is_subprog(prog)) + return true; + ctxp->prog = prog; + return false; +} + +struct bpf_prog *bpf_prog_find_from_stack(void) +{ + struct walk_stack_ctx ctx = {}; + + arch_bpf_stack_walk(find_from_stack_cb, &ctx); + return ctx.prog; +} + +#endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 67e8a2fc1a99..b2b7b8ec2c2a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -282,8 +282,7 @@ static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) * This is equivalent to how NAPI decides whether to perform a full * flush. */ - gro_flush(&rcpu->gro, !empty && HZ >= 1000); - gro_normal_list(&rcpu->gro); + gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000); } static int cpu_map_kthread_run(void *data) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 88035dae38c1..6b4877e85a68 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -24,6 +24,7 @@ #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> #include <linux/bpf_verifier.h> +#include <linux/uaccess.h> #include "../../lib/kstrtox.h" @@ -763,22 +764,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary - * arguments representation. 
- */ -#define MAX_BPRINTF_BIN_ARGS 512 - /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 -struct bpf_bprintf_buffers { - char bin_args[MAX_BPRINTF_BIN_ARGS]; - char buf[MAX_BPRINTF_BUF]; -}; static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_buffers(struct bpf_bprintf_buffers **bufs) +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; @@ -794,16 +786,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs) return 0; } -void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +void bpf_put_buffers(void) { - if (!data->bin_args && !data->buf) - return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +{ + if (!data->bin_args && !data->buf) + return; + bpf_put_buffers(); +} + /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * @@ -818,7 +815,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; @@ -834,7 +831,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (get_buffers && try_get_buffers(&buffers)) + if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { @@ -2911,6 +2908,52 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return 0; } +/** + * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. + * @p: Destination dynptr - where data will be filled + * @offset: Offset into the dynptr to start filling from + * @size: Number of bytes to fill + * @val: Constant byte to fill the memory with + * + * Fills the @size bytes of the memory area pointed to by @p + * at @offset with the constant byte @val. + * Returns 0 on success; negative error, otherwise. 
+ */ + __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) + { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + u32 chunk_sz, write_off; + char buf[256]; + void* slice; + int err; + + slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); + if (likely(slice)) { + memset(slice, val, size); + return 0; + } + + if (__bpf_dynptr_is_rdonly(ptr)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(ptr, offset, size); + if (err) + return err; + + /* Non-linear data under the dynptr, write from a local buffer */ + chunk_sz = min_t(u32, sizeof(buf), size); + memset(buf, val, chunk_sz); + + for (write_off = 0; write_off < size; write_off += chunk_sz) { + chunk_sz = min_t(u32, sizeof(buf), size - write_off); + err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); + if (err) + return err; + } + + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2943,9 +2986,16 @@ static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; - if (!is_bpf_text_address(ip)) - return !ctx->cnt; + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. + */ + rcu_read_lock(); prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; @@ -3283,6 +3333,376 @@ __bpf_kfunc void __bpf_trap(void) { } +/* + * Kfuncs for string operations. + * + * Since strings are not necessarily %NUL-terminated, we cannot directly call + * in-kernel implementations. Instead, we open-code the implementations using + * __get_kernel_nofault instead of plain dereference to make them safe. + */ + +/** + * bpf_strcmp - Compare two strings + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +{ + char c1, c2; + int i; + + if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || + !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c1, s1__ign, char, err_out); + __get_kernel_nofault(&c2, s2__ign, char, err_out); + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (c1 == '\0') + return 0; + s1__ign++; + s2__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnchr - Find a character in a length limited string + * @s__ign: The string to be searched + * @count: The number of characters to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. 
+ * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in the first @count characters of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + return i; + if (sc == '\0') + return -ENOENT; + s__ign++; + } + return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; +err_out: + return -EFAULT; +} + +/** + * bpf_strchr - Find the first occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. + * + * Return: + * * >=0 - The index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchr(const char *s__ign, char c) +{ + return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); +} + +/** + * bpf_strchrnul - Find and return a character in a string, or end of string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign or index of + * the null byte at the end of @s__ign when @c is not found + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == '\0' || sc == c) + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strrchr - Find the last occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the last occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strrchr(const char *s__ign, int c) +{ + char sc; + int i, last = -ENOENT; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + last = i; + if (sc == '\0') + return last; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnlen - Calculate the length of a length-limited string + * @s__ign: The string + * @count: The maximum number of characters to count + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) +{ + char c; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i 
< count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c, s__ign, char, err_out); + if (c == '\0') + return i; + s__ign++; + } + return i == XATTR_SIZE_MAX ? -E2BIG : i; +err_out: + return -EFAULT; +} + +/** + * bpf_strlen - Calculate the length of a string + * @s__ign: The string + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strlen(const char *s__ign) +{ + return bpf_strnlen(s__ign, XATTR_SIZE_MAX); +} + +/** + * bpf_strspn - Calculate the length of the initial substring of @s__ign which + * only contains letters in @accept__ign + * @s__ign: The string to be searched + * @accept__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which only + * contains letters from @accept__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) +{ + char cs, ca; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(accept__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&ca, accept__ign + j, char, err_out); + if (cs == ca || ca == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (ca == '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strcspn - Calculate the length of the initial substring of @s__ign which + * does not contain letters in @reject__ign + * @s__ign: The string to be searched + * @reject__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which does not + * contain letters from @reject__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) +{ + char cs, cr; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(reject__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&cr, reject__ign + j, char, err_out); + if (cs == cr || cr == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (cr != '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnstr - Find the first substring in a length-limited string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int 
bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) +{ + char c1, c2; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || + !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&c2, s2__ign + j, char, err_out); + if (c2 == '\0') + return i; + __get_kernel_nofault(&c1, s1__ign + j, char, err_out); + if (c1 == '\0') + return -ENOENT; + if (c1 != c2) + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (i + j == len) + return -ENOENT; + s1__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strstr - Find the first substring in a string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) +{ + return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3369,6 +3789,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) +BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif @@ -3402,9 +3823,21 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPAB BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) -#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_strcmp); +BTF_ID_FLAGS(func, bpf_strchr); +BTF_ID_FLAGS(func, bpf_strchrnul); +BTF_ID_FLAGS(func, bpf_strnchr); +BTF_ID_FLAGS(func, bpf_strrchr); +BTF_ID_FLAGS(func, bpf_strlen); +BTF_ID_FLAGS(func, bpf_strnlen); +BTF_ID_FLAGS(func, bpf_strspn); +BTF_ID_FLAGS(func, bpf_strcspn); +BTF_ID_FLAGS(func, bpf_strstr); +BTF_ID_FLAGS(func, bpf_strnstr); +#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c index fec8005a121c..8158e9c1af7b 100644 --- a/kernel/bpf/link_iter.c +++ b/kernel/bpf/link_iter.c @@ -78,8 +78,7 @@ static const struct seq_operations bpf_link_seq_ops = { .show = bpf_link_seq_show, }; -BTF_ID_LIST(btf_bpf_link_id) -BTF_ID(struct, bpf_link) +BTF_ID_LIST_SINGLE(btf_bpf_link_id, struct, bpf_link) static const struct bpf_iter_seq_info bpf_link_seq_info = { .seq_ops = &bpf_link_seq_ops, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 3969eb0382af..632d51b05fe9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -394,17 +394,10 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; } else { - u32 int_data; - /* * Key is expected to be u64, which stores the cgroup_inode_id */ - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if 
(BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i64(key_type)) return -EINVAL; } diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 868cc2c43899..8e88201c98bf 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -11,8 +11,6 @@ struct bpf_netns_link { struct bpf_link link; - enum bpf_attach_type type; - enum netns_bpf_attach_type netns_type; /* We don't hold a ref to net in order to auto-detach the link * when netns is going away. Instead we rely on pernet @@ -21,6 +19,7 @@ struct bpf_netns_link { */ struct net *net; struct list_head node; /* node in list of links attached to net */ + enum netns_bpf_attach_type netns_type; }; /* Protects updates to netns_bpf */ @@ -216,7 +215,7 @@ static int bpf_netns_link_fill_info(const struct bpf_link *link, mutex_unlock(&netns_bpf_mutex); info->netns.netns_ino = inum; - info->netns.attach_type = net_link->type; + info->netns.attach_type = link->attach_type; return 0; } @@ -230,7 +229,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, "netns_ino:\t%u\n" "attach_type:\t%u\n", info.netns.netns_ino, - info.netns.attach_type); + link->attach_type); } static const struct bpf_link_ops bpf_netns_link_ops = { @@ -501,9 +500,8 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, - &bpf_netns_link_ops, prog); + &bpf_netns_link_ops, prog, type); net_link->net = net; - net_link->type = type; net_link->netns_type = netns_type; err = bpf_link_prime(&net_link->link, &link_primer); diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index c9d45c9d6918..aef7b0bc96d6 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -1,8 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -config USERMODE_DRIVER - bool - default n - menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF @@ -10,7 +6,6 @@ menuconfig BPF_PRELOAD # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST - select USERMODE_DRIVER help This builds kernel module with several embedded BPF programs that are pinned into BPF FS mount point as human readable files that are diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h index ebdc6c0cdb70..49b1d515a847 100644 --- a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h +++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h @@ -89,10 +89,7 @@ iterators_bpf__load(struct iterators_bpf *skel) { struct bpf_load_and_run_opts opts = {}; int err; - - opts.ctx = (struct bpf_loader_ctx *)skel; - opts.data_sz = 6008; - opts.data = (void *)"\ + static const char opts_data[] __attribute__((__aligned__(8))) = "\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ @@ -126,190 +123,196 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\ 
-\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\ +\0\0\0\x18\0\0\0\0\0\0\x04\x80\0\0\x04\x80\0\0\x05\x44\0\0\0\0\x02\0\0\0\0\0\0\ \x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\ \0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\ \0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\ -\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\ -\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\ -\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\ -\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\ -\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\ -\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\ -\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\ -\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\ -\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\ -\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\ -\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\ -\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\ -\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\ -\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\ -\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\ -\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\ -\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\ -\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\ -\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\ -\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\ -\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\ -\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\ -\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\ -\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\ -\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\ -\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\ -\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\ -\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\ -\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\ -\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\ -\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\ -\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\ -\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\ -\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\ -\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\ -\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\ -\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\ -\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\ 
-\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\ -\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\ -\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\ -\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\ -\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\ -\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\ -\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\ -\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\ -\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\ -\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\ -\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\ -\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\ -\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\ -\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\ -\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\ -\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\ -\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\ -\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\ -\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\ -\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ -\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\ -\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\ -\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\ -\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\ -\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ -\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\ -\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\ -\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\ -\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\ -\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\ -\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\ -\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\ -\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\ -\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\ -\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\ -\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\ -\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\ -\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\ 
-\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\ -\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\ -\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\ -\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\ -\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\ -\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\ -\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\ -\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\ -\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ -\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ -\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ -\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\ -\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\ -\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\ -\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\ -\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\ -\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\ -\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\ +\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc3\x04\0\0\x03\0\0\0\x18\0\0\0\ +\xd1\0\0\0\x09\0\0\0\0\0\0\0\xd5\0\0\0\x0b\0\0\0\x40\0\0\0\xe0\0\0\0\x0b\0\0\0\ +\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe8\x07\0\0\0\0\0\0\0\0\0\0\xf1\x08\0\0\ +\0\0\0\0\x0c\0\0\0\xf7\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xc1\x04\0\0\x03\0\ +\0\0\x18\0\0\x01\xc9\0\0\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x01\ +\xd1\0\0\0\x0e\0\0\0\xa0\0\0\x01\xdd\x08\0\0\0\0\0\0\x0f\0\0\x01\xe3\x01\0\0\0\ +\0\0\0\x04\0\0\0\x20\0\0\x01\xf0\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\ +\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xf5\x01\0\0\0\0\0\0\x04\0\0\ +\0\x20\0\0\0\0\x0d\0\0\x01\0\0\0\x14\0\0\x05\x39\0\0\0\x04\0\0\x02\x3e\x08\0\0\ +\0\0\0\0\x15\0\0\x02\x44\x01\0\0\0\0\0\0\x08\x01\0\0\x40\0\0\x02\x4e\x0c\0\0\ +\x01\0\0\0\x13\0\0\0\0\x02\0\0\0\0\0\0\x18\0\0\x02\x65\x04\0\0\x02\0\0\0\x10\0\ +\0\0\x13\0\0\0\x03\0\0\0\0\0\0\x02\x78\0\0\0\x19\0\0\0\x40\0\0\0\0\x02\0\0\0\0\ +\0\0\x1c\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x17\0\0\x02\x7d\x0c\0\0\ +\x01\0\0\0\x1a\0\0\x02\xc9\x04\0\0\x01\0\0\0\x08\0\0\x02\xd2\0\0\0\x1d\0\0\0\0\ +\0\0\0\0\x02\0\0\0\0\0\0\x1e\0\0\x03\x23\x04\0\0\x06\0\0\0\x38\0\0\x01\xc9\0\0\ +\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x03\x30\0\0\0\x1f\0\0\0\xc0\ +\0\0\x03\x41\0\0\0\x19\0\0\x01\0\0\0\x03\x4a\0\0\0\x21\0\0\x01\x40\0\0\x03\x54\ +\0\0\0\x22\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\0\0\x0a\0\0\0\0\0\0\x10\ +\0\0\0\0\x02\0\0\0\0\0\0\x23\0\0\0\0\x02\0\0\0\0\0\0\x24\0\0\x03\x9e\x04\0\0\ +\x02\0\0\0\x08\0\0\x03\xac\0\0\0\x0e\0\0\0\0\0\0\x03\xb5\0\0\0\x0e\0\0\0\x20\0\ +\0\x03\x54\x04\0\0\x03\0\0\0\x18\0\0\x03\xbf\0\0\0\x1f\0\0\0\0\0\0\x03\xc7\0\0\ +\0\x25\0\0\0\x40\0\0\x03\xcd\0\0\0\x27\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x26\0\ +\0\0\0\x02\0\0\0\0\0\0\x28\0\0\x03\xd1\x04\0\0\x01\0\0\0\x04\0\0\x03\xdc\0\0\0\ +\x0e\0\0\0\0\0\0\x04\x45\x04\0\0\x01\0\0\0\x04\0\0\x04\x4e\0\0\0\x0e\0\0\0\0\0\ 
+\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\x04\xc4\x0e\0\0\0\0\ +\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\ +\x04\xd8\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\ +\x12\0\0\0\x20\0\0\x04\xee\x0e\0\0\0\0\0\0\x2d\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\ +\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\x05\x03\x0e\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\ +\0\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\x1a\x0e\0\0\0\0\0\0\ +\x31\0\0\0\x01\0\0\x05\x22\x0f\0\0\x01\0\0\0\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\ +\0\x05\x29\x0f\0\0\x04\0\0\0\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\ +\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\ +\x05\x31\x0f\0\0\x01\0\0\0\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\x05\x39\x0e\0\0\ +\0\0\0\0\x06\0\0\0\x01\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\ +\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x32\x2f\x69\x69\x69\ +\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\ +\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\ +\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\ +\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\ +\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\ +\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\ +\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\ +\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\ +\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\ +\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\ +\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\ +\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\ +\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\ +\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\ +\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\ +\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\ +\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\ +\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\ +\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\ +\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\ +\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\ +\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\ +\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\ +\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\ +\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\ +\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ +\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\ 
+\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\ +\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\ +\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\ +\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\ +\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\ +\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\ +\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\ +\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\ +\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\ +\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\ +\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\ +\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\ +\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\ +\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\ +\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\ +\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\ +\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\ +\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\ +\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\ +\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\ +\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\ +\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\ +\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\ +\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\ +\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\ +\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\ +\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\ +\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\ +\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\ +\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\x73\0\x2e\ +\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\x6d\x79\ +\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x09\xdc\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\x80\0\0\0\0\ +\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\ +\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\ +\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\ +\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\ +\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\ \x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\ \x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\ 
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\ -\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\ -\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\ -\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\ -\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\ -\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\ -\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\ -\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\ -\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\ -\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\ -\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\ -\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\ -\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\ -\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\ -\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\ -\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\ -\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\ -\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\ -\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\ -\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\ -\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ -\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\ -\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\ -\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\ -\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\ -\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\ -\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\ -\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\ -\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\ -\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\ -\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\ +\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\0\ +\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1d\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\ +\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe0\xbf\x16\0\0\ +\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb4\x30\0\0\0\0\0\x30\xb4\x50\0\0\ +\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe0\0\0\0\0\xb7\ +\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xe8\0\0\ +\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf0\0\0\0\0\xbf\x17\0\0\0\0\0\0\x85\x02\ +\0\0\0\0\0\0\x7b\xa0\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\ +\xff\xe0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\xb4\x30\0\0\ 
+\0\0\0\x1a\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x44\x1e\0\0\0\ +\x01\0\0\0\x42\0\0\0\x9b\0\x01\x44\x24\0\0\0\x02\0\0\0\x42\0\0\x01\x0e\0\x01\ +\x4c\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2f\0\x01\x54\x06\0\0\0\x04\0\0\0\x42\0\0\ +\x01\x3e\0\x01\x48\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x63\0\x01\x60\x0e\0\0\0\x08\ +\0\0\0\x42\0\0\x01\x76\0\x01\x64\x03\0\0\0\x0e\0\0\0\x42\0\0\x02\x09\0\x01\x6c\ +\x02\0\0\0\x21\0\0\0\x42\0\0\x02\x3c\0\x01\x80\x01\0\0\0\0\0\0\0\x02\0\0\0\x3e\ +\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\x01\x0a\ +\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\ +\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x0a\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\x01\ +\x3a\0\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\ +\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\ +\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\ +\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\x50\x4c\0\0\0\0\0\x79\ +\x21\0\0\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\ +\0\x79\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\ +\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x4a\xb4\x30\0\0\0\0\0\x20\xb4\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\ +\x7b\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\ +\0\0\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\ +\x87\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\ +\0\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\ +\0\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\ +\xb4\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\ \0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\ -\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\ -\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\ +\xb4\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\ +\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3e\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\ \xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\ \xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\ \xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\ -\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\ -\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\ -\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\ -\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\ -\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\ -\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\ -\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\ -\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\ 
-\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\ -\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\ -\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\ -\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\ -\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\ -\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\ -\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\ -\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\ -\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\ -\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\ -\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\ -\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\ -\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\ -\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\ -\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\ -\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\ -\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\ -\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\ -\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; - opts.insns_sz = 2216; - opts.insns = (void *)"\ +\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x6a\xb4\x30\0\0\0\0\0\x11\ +\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x94\x1e\0\0\0\x01\0\0\0\ +\x42\0\0\0\x9b\0\x01\x94\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x99\0\x01\x9c\x1f\0\0\ +\0\x03\0\0\0\x42\0\0\x02\xbd\0\x01\xa8\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xd6\0\ +\x01\xb4\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3e\0\x01\x98\x1d\0\0\0\x06\0\0\0\x42\ +\0\0\x01\x63\0\x01\xb8\x0e\0\0\0\x09\0\0\0\x42\0\0\x02\xe8\0\x01\xbc\x03\0\0\0\ +\x10\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x93\0\x01\ +\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x1b\0\0\0\x42\0\0\ +\x03\xe4\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xf9\0\x01\x14\x2d\0\0\0\x1e\ +\0\0\0\x42\0\0\x04\x30\0\x01\x0c\x0d\0\0\0\x21\0\0\0\x42\0\0\x03\xf9\0\x01\x14\ +\x02\0\0\0\x24\0\0\0\x42\0\0\x04\x57\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\ +\x57\0\x01\x18\x0d\0\0\0\x2c\0\0\0\x42\0\0\x04\x85\0\x01\x1c\x1b\0\0\0\x2d\0\0\ +\0\x42\0\0\x04\x85\0\x01\x1c\x0f\0\0\0\x2e\0\0\0\x42\0\0\x04\xa8\0\x01\x24\x0d\ +\0\0\0\x30\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x3c\ +\0\x01\xd4\x01\0\0\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\ +\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\x01\x0a\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\0\ +\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\0\ +\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\x01\x0a\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\ +\x03\x8b\0\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\x03\x8f\0\0\0\0\0\0\0\xc0\0\0\0\x23\0\ 
+\0\x03\xbd\0\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\x01\x0a\0\0\0\0\0\0\0\xf0\0\0\0\x24\ +\0\0\0\x3e\0\0\0\0\0\0\x01\x18\0\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\ +\x1e\0\0\x01\x0a\0\0\0\0\0\0\x01\x60\0\0\0\x24\0\0\x04\x7f\0\0\0\0\0\0\x01\x88\ +\0\0\0\x1e\0\0\x01\x3a\0\0\0\0\0\0\x01\x98\0\0\0\x1e\0\0\x04\xc0\0\0\0\0\0\0\ +\x01\xa0\0\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\ +\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x16\0\0\0\x01\0\0\0\0\0\ +\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\ +\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; + static const char opts_insn[] __attribute__((__aligned__(8))) = "\ \xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\ \0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\ \x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\ @@ -318,72 +321,87 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\ \0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\ \0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ -\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\ -\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\ -\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ -\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\ +\0\x0e\xf8\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\ +\0\0\0\x0e\xf4\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0e\xe8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ +\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\ \0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\ -\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\ +\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x30\x63\x10\0\0\0\ \0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ -\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ -\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\ +\0\x0f\x0c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ +\0\0\x0f\0\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\ \x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\ \x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ -\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\ +\x0f\x48\xb7\x20\0\0\0\0\0\x7b\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\ \0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\ -\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\ -\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\ 
-\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ -\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\x63\ +\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xc8\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0f\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ +\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\ \0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\ -\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\ +\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\x63\x10\ +\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\xb7\ \x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\ -\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ -\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\ -\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ -\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\ -\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\ -\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\ -\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\ -\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\ -\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\ -\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\ -\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\ -\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\ -\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\ -\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\ -\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\ -\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\ -\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\ -\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\ -\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\ -\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ -\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\ -\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\ -\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\ -\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\ -\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\ -\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\ -\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\ -\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\ -\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\ 
-\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\ -\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\ -\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\ -\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\ -\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\ -\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\ -\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\ -\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\ -\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\ -\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\ -\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\ -\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0"; +\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ +\x12\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\x18\x16\0\0\ +\0\0\0\0\0\0\0\0\0\0\x12\x28\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\ +\0\x11\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x70\x7b\x10\0\0\0\0\0\0\x18\x06\ +\0\0\0\0\0\0\0\0\0\0\0\0\x11\x20\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x80\x7b\ +\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x12\xa0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\ +\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x98\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x38\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\ +\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x3c\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\ +\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x40\x7b\x10\0\0\0\0\0\0\x61\x0a\ +\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x68\x63\x10\0\0\0\0\0\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xb0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\0\ +\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\ +\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\x63\x07\0\x6c\0\0\0\0\x77\ +\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\x18\x86\0\0\0\0\0\0\0\0\0\0\0\0\x10\ +\xb8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xc8\xb7\x20\0\0\0\0\0\x17\xb7\x30\0\0\ +\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\ +\xff\x4d\0\0\0\0\x75\x70\0\x03\0\0\0\0\x62\x80\0\x04\0\0\0\0\x6a\x80\0\x02\0\0\ +\0\0\x05\0\0\x0a\0\0\0\0\x63\x87\0\x04\0\0\0\0\xbf\x97\0\0\0\0\0\0\x77\x90\0\0\ +\0\0\0\x20\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\x63\x09\0\0\0\0\0\0\x55\x90\0\ +\x02\0\0\0\0\x6a\x80\0\x02\0\0\0\0\x05\0\0\x01\0\0\0\0\x6a\x80\0\x02\0\0\0\x40\ +\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\xb7\x30\0\0\0\0\ +\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\ +\x01\0\x61\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\ +\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x90\x61\x10\0\0\0\0\0\0\xd5\x10\ +\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x2c\0\0\0\0\ +\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\xe0\x18\x16\0\0\0\ +\0\0\0\0\0\0\0\0\0\x17\x88\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\ +\x12\xe8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x80\x7b\x10\0\0\0\0\0\0\x18\x06\0\ +\0\0\0\0\0\0\0\0\0\0\0\x14\xf0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc8\x7b\x10\ +\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\xf8\x18\x16\0\0\0\0\0\0\0\0\0\ 
+\0\0\0\x17\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\x58\x18\ +\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf0\x7b\x10\0\0\0\0\0\0\x61\ +\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x90\x63\x10\0\0\0\0\0\0\ +\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x94\x63\x10\0\0\0\0\ +\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x98\x7b\x10\0\0\ +\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc0\x63\ +\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x18\x08\xb7\x20\0\0\0\0\0\x12\ +\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\ +\0\0\xc5\x70\xfe\xf5\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\x63\x07\0\ +\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\ +\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\ +\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\xe8\x61\x10\0\0\0\ +\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\ +\xfe\xe3\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\x10\0\x02\ +\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\0\0\x63\ +\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\x16\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\0\0\0\0\0\ +\0\0\x95\0\0\0\0\0\0\0"; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = sizeof(opts_data) - 1; + opts.data = (void *)opts_data; + opts.insns_sz = sizeof(opts_insn) - 1; + opts.insns = (void *)opts_insn; + err = bpf_load_and_run(&opts); if (err < 0) return err; diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c index 53a73c841c13..85d8fcb56fb7 100644 --- a/kernel/bpf/prog_iter.c +++ b/kernel/bpf/prog_iter.c @@ -78,8 +78,7 @@ static const struct seq_operations bpf_prog_seq_ops = { .show = bpf_prog_seq_show, }; -BTF_ID_LIST(btf_bpf_prog_id) -BTF_ID(struct, bpf_prog) +BTF_ID_LIST_SINGLE(btf_bpf_prog_id, struct, bpf_prog) static const struct bpf_iter_seq_info bpf_prog_seq_info = { .seq_ops = &bpf_prog_seq_ops, diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 338305c8852c..5ab354d55d82 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -666,6 +666,27 @@ EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); __bpf_kfunc_start_defs(); +static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : ""); + bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock); + bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt); + for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++) + bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]); + bpf_stream_dump_stack(ss); + })); +} + +#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? 
"Timeout detected" : "AA or ABBA deadlock detected"; }) + __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) { int ret; @@ -676,6 +697,7 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) preempt_disable(); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false); preempt_enable(); return ret; } @@ -698,6 +720,7 @@ __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsign local_irq_save(flags); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true); local_irq_restore(flags); preempt_enable(); return ret; diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c new file mode 100644 index 000000000000..ab592db4a4bf --- /dev/null +++ b/kernel/bpf/stream.c @@ -0,0 +1,526 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/percpu.h> +#include <linux/refcount.h> +#include <linux/gfp.h> +#include <linux/memory.h> +#include <linux/local_lock.h> +#include <linux/mutex.h> + +/* + * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe + * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and + * stash it in a local per-CPU variable, and bump allocate from the page + * whenever items need to be printed to a stream. Each page holds a global + * atomic refcount in its first 4 bytes, and then records of variable length + * that describe the printed messages. Once the global refcount has dropped to + * zero, it is a signal to free the page back to the kernel's page allocator, + * given all the individual records in it have been consumed. + * + * It is possible the same page is used to serve allocations across different + * programs, which may be consumed at different times individually, hence + * maintaining a reference count per-page is critical for correct lifetime + * tracking. + * + * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it + * lands. + */ +struct bpf_stream_page { + refcount_t ref; + u32 consumed; + char buf[]; +}; + +/* Available room to add data to a refcounted page. 
*/ +#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) + +static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); +static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); + +static bool bpf_stream_page_local_lock(unsigned long *flags) +{ + return local_trylock_irqsave(&stream_local_lock, *flags); +} + +static void bpf_stream_page_local_unlock(unsigned long *flags) +{ + local_unlock_irqrestore(&stream_local_lock, *flags); +} + +static void bpf_stream_page_free(struct bpf_stream_page *stream_page) +{ + struct page *p; + + if (!stream_page) + return; + p = virt_to_page(stream_page); + free_pages_nolock(p, 0); +} + +static void bpf_stream_page_get(struct bpf_stream_page *stream_page) +{ + refcount_inc(&stream_page->ref); +} + +static void bpf_stream_page_put(struct bpf_stream_page *stream_page) +{ + if (refcount_dec_and_test(&stream_page->ref)) + bpf_stream_page_free(stream_page); +} + +static void bpf_stream_page_init(struct bpf_stream_page *stream_page) +{ + refcount_set(&stream_page->ref, 1); + stream_page->consumed = 0; +} + +static struct bpf_stream_page *bpf_stream_page_replace(void) +{ + struct bpf_stream_page *stream_page, *old_stream_page; + struct page *page; + + page = alloc_pages_nolock(NUMA_NO_NODE, 0); + if (!page) + return NULL; + stream_page = page_address(page); + bpf_stream_page_init(stream_page); + + old_stream_page = this_cpu_read(stream_pcpu_page); + if (old_stream_page) + bpf_stream_page_put(old_stream_page); + this_cpu_write(stream_pcpu_page, stream_page); + return stream_page; +} + +static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) +{ + int min = offsetof(struct bpf_stream_elem, str[0]); + int consumed = stream_page->consumed; + int total = BPF_STREAM_PAGE_SZ; + int rem = max(0, total - consumed - min); + + /* Let's give room of at least 8 bytes. */ + WARN_ON_ONCE(rem % 8 != 0); + rem = rem < 8 ? 
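+	       /* Fewer than 8 spare bytes is treated as no room at all, keeping element offsets 8-byte aligned. */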
0 : rem; + return min(len, rem); +} + +static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) +{ + init_llist_node(&elem->node); + elem->total_len = len; + elem->consumed_len = 0; +} + +static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) +{ + unsigned long addr = (unsigned long)elem; + + return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); +} + +static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) +{ + u32 consumed = stream_page->consumed; + + stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); + return (struct bpf_stream_elem *)&stream_page->buf[consumed]; +} + +static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) +{ + struct bpf_stream_elem *elem = NULL; + struct bpf_stream_page *page; + int room = 0; + + page = this_cpu_read(stream_pcpu_page); + if (!page) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + + room = bpf_stream_page_check_room(page, len); + if (room != len) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + bpf_stream_page_get(page); + room = bpf_stream_page_check_room(page, len); + WARN_ON_ONCE(room != len); + + elem = bpf_stream_page_push_elem(page, room); + bpf_stream_elem_init(elem, room); + return elem; +} + +static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) +{ + const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); + struct bpf_stream_elem *elem; + unsigned long flags; + + BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); + /* + * Length denotes the amount of data to be written as part of stream element, + * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can + * accomodate, therefore deny allocations that won't fit into them. + */ + if (len < 0 || len > max_len) + return NULL; + + if (!bpf_stream_page_local_lock(&flags)) + return NULL; + elem = bpf_stream_page_reserve_elem(len); + bpf_stream_page_local_unlock(&flags); + return elem; +} + +static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) +{ + struct bpf_stream_elem *elem = NULL; + + /* + * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream + * log, elements will be popped at once and reversed to print the log. 
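+	 * llist_add() prepends, so the drained batch must be reversed to recover insertion order.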
+ */ + elem = bpf_stream_elem_alloc(len); + if (!elem) + return -ENOMEM; + + memcpy(elem->str, str, len); + llist_add(&elem->node, log); + + return 0; +} + +static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) +{ + if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) + return -ENOSPC; + if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { + atomic_sub(len, &stream->capacity); + return -ENOSPC; + } + return 0; +} + +static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) +{ + int len = elem->total_len; + + atomic_sub(len, &stream->capacity); +} + +static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) +{ + int ret = bpf_stream_consume_capacity(stream, len); + + return ret ?: __bpf_stream_push_str(&stream->log, str, len); +} + +static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux) +{ + if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) + return NULL; + return &aux->stream[stream_id - 1]; +} + +static void bpf_stream_free_elem(struct bpf_stream_elem *elem) +{ + struct bpf_stream_page *p; + + p = bpf_stream_page_from_elem(elem); + bpf_stream_page_put(p); +} + +static void bpf_stream_free_list(struct llist_node *list) +{ + struct bpf_stream_elem *elem, *tmp; + + llist_for_each_entry_safe(elem, tmp, list, node) + bpf_stream_free_elem(elem); +} + +static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream) +{ + return stream->backlog_head; +} + +static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) +{ + struct llist_node *node; + + node = stream->backlog_head; + if (stream->backlog_head == stream->backlog_tail) + stream->backlog_head = stream->backlog_tail = NULL; + else + stream->backlog_head = node->next; + return node; +} + +static void bpf_stream_backlog_fill(struct bpf_stream *stream) +{ + struct llist_node *head, *tail; + + if (llist_empty(&stream->log)) + return; + tail = llist_del_all(&stream->log); + if (!tail) + return; + head = llist_reverse_order(tail); + + if (!stream->backlog_head) { + stream->backlog_head = head; + stream->backlog_tail = tail; + } else { + stream->backlog_tail->next = head; + stream->backlog_tail = tail; + } + + return; +} + +static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len) +{ + int rem = elem->total_len - elem->consumed_len; + int used = min(rem, *len); + + elem->consumed_len += used; + *len -= used; + + return elem->consumed_len == elem->total_len; +} + +static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len) +{ + int rem_len = len, cons_len, ret = 0; + struct bpf_stream_elem *elem = NULL; + struct llist_node *node; + + mutex_lock(&stream->lock); + + while (rem_len) { + int pos = len - rem_len; + bool cont; + + node = bpf_stream_backlog_peek(stream); + if (!node) { + bpf_stream_backlog_fill(stream); + node = bpf_stream_backlog_peek(stream); + } + if (!node) + break; + elem = container_of(node, typeof(*elem), node); + + cons_len = elem->consumed_len; + cont = bpf_stream_consume_elem(elem, &rem_len) == false; + + ret = copy_to_user(buf + pos, elem->str + cons_len, + elem->consumed_len - cons_len); + /* Restore in case of error. */ + if (ret) { + ret = -EFAULT; + elem->consumed_len = cons_len; + break; + } + + if (cont) + continue; + bpf_stream_backlog_pop(stream); + bpf_stream_release_capacity(stream, elem); + bpf_stream_free_elem(elem); + } + + mutex_unlock(&stream->lock); + return ret ? 
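+	       /* On copy_to_user() failure return -EFAULT, otherwise the number of bytes copied out. */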
ret : len - rem_len; +} + +int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len) +{ + struct bpf_stream *stream; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -ENOENT; + return bpf_stream_read(stream, buf, len); +} + +__bpf_kfunc_start_defs(); + +/* + * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the + * enum in headers. + */ +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + struct bpf_prog_aux *aux = aux__prog; + u32 fmt_size = strlen(fmt__str) + 1; + struct bpf_stream *stream; + u32 data_len = len__sz; + int ret, num_args; + + stream = bpf_stream_get(stream_id, aux); + if (!stream) + return -ENOENT; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); + /* Exclude NULL byte during push. */ + ret = bpf_stream_push_str(stream, data.buf, ret); + bpf_bprintf_cleanup(&data); + + return ret; +} + +__bpf_kfunc_end_defs(); + +/* Added kfunc to common_btf_ids */ + +void bpf_prog_stream_init(struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + atomic_set(&prog->aux->stream[i].capacity, 0); + init_llist_head(&prog->aux->stream[i].log); + mutex_init(&prog->aux->stream[i].lock); + prog->aux->stream[i].backlog_head = NULL; + prog->aux->stream[i].backlog_tail = NULL; + } +} + +void bpf_prog_stream_free(struct bpf_prog *prog) +{ + struct llist_node *list; + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + list = llist_del_all(&prog->aux->stream[i].log); + bpf_stream_free_list(list); + bpf_stream_free_list(prog->aux->stream[i].backlog_head); + } +} + +void bpf_stream_stage_init(struct bpf_stream_stage *ss) +{ + init_llist_head(&ss->log); + ss->len = 0; +} + +void bpf_stream_stage_free(struct bpf_stream_stage *ss) +{ + struct llist_node *node; + + node = llist_del_all(&ss->log); + bpf_stream_free_list(node); +} + +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) +{ + struct bpf_bprintf_buffers *buf; + va_list args; + int ret; + + if (bpf_try_get_buffers(&buf)) + return -EBUSY; + + va_start(args, fmt); + ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); + va_end(args); + ss->len += ret; + /* Exclude NULL byte during push. 
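+	 * vsnprintf() returns the formatted length without the trailing NUL, so only the message text itself is pushed.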
*/ + ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); + bpf_put_buffers(); + return ret; +} + +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id) +{ + struct llist_node *list, *head, *tail; + struct bpf_stream *stream; + int ret; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -EINVAL; + + ret = bpf_stream_consume_capacity(stream, ss->len); + if (ret) + return ret; + + list = llist_del_all(&ss->log); + head = tail = list; + + if (!list) + return 0; + while (llist_next(list)) { + tail = llist_next(list); + list = tail; + } + llist_add_batch(head, tail, &stream->log); + return 0; +} + +struct dump_stack_ctx { + struct bpf_stream_stage *ss; + int err; +}; + +static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct dump_stack_ctx *ctxp = cookie; + const char *file = "", *line = ""; + struct bpf_prog *prog; + int num, ret; + + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (prog) { + ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num); + if (ret < 0) + goto end; + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", + (void *)(long)ip, line, file, num); + return !ctxp->err; + } +end: + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip); + return !ctxp->err; +} + +int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) +{ + struct dump_stack_ctx ctx = { .ss = ss }; + int ret; + + ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n", + raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), + current->pid, current->comm); + if (ret) + return ret; + ret = bpf_stream_stage_printk(ss, "Call trace:\n"); + if (ret) + return ret; + arch_bpf_stack_walk(dump_stack_cb, &ctx); + if (ctx.err) + return ctx.err; + return bpf_stream_stage_printk(ss, "\n"); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd5304c6ac3c..e63039817af3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3069,7 +3069,7 @@ static int bpf_obj_get(const union bpf_attr *attr) */ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, - bool sleepable) + enum bpf_attach_type attach_type, bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); @@ -3078,12 +3078,14 @@ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, link->id = 0; link->ops = ops; link->prog = prog; + link->attach_type = attach_type; } void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type) { - bpf_link_init_sleepable(link, type, ops, prog, false); + bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } static void bpf_link_free_id(int id) @@ -3228,7 +3230,14 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { - seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? + "kretprobe_multi" : "kprobe_multi"); + else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? 
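+			   /* The *_RETURN flag selects the return-probe spelling in fdinfo. */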
+ "uretprobe_multi" : "uprobe_multi"); + else + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); } else { WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); seq_printf(m, "link_type:\t<%u>\n", type); @@ -3403,10 +3412,12 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, seq_printf(seq, "attach_type:\t%d\n" "target_obj_id:\t%u\n" - "target_btf_id:\t%u\n", - tr_link->attach_type, + "target_btf_id:\t%u\n" + "cookie:\t%llu\n", + link->attach_type, target_obj_id, - target_btf_id); + target_btf_id, + tr_link->link.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, @@ -3415,7 +3426,8 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - info->tracing.attach_type = tr_link->attach_type; + info->tracing.attach_type = link->attach_type; + info->tracing.cookie = tr_link->link.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); @@ -3433,7 +3445,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, u32 btf_id, - u64 bpf_cookie) + u64 bpf_cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -3501,8 +3514,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog); - link->attach_type = prog->expected_attach_type; + &bpf_tracing_link_lops, prog, attach_type); + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -3651,8 +3664,10 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, container_of(link, struct bpf_raw_tp_link, link); seq_printf(seq, - "tp_name:\t%s\n", - raw_tp_link->btp->tp->name); + "tp_name:\t%s\n" + "cookie:\t%llu\n", + raw_tp_link->btp->tp->name, + raw_tp_link->cookie); } static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, @@ -3688,6 +3703,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; + info->raw_tracepoint.cookie = raw_tp_link->cookie; if (!ubuf) return 0; @@ -3794,6 +3810,32 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.kprobe.cookie = event->bpf_cookie; return 0; } + +static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event, + struct seq_file *seq) +{ + const char *name; + int err; + u32 prog_id, type; + u64 offset, addr; + unsigned long missed; + + err = bpf_get_perf_event_info(event, &prog_id, &type, &name, + &offset, &addr, &missed); + if (err) + return; + + seq_printf(seq, + "name:\t%s\n" + "offset:\t%#llx\n" + "missed:\t%lu\n" + "addr:\t%#llx\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, offset, missed, addr, + type == BPF_FD_TYPE_KRETPROBE ? 
"kretprobe" : "kprobe", + event->bpf_cookie); +} #endif #ifdef CONFIG_UPROBE_EVENTS @@ -3822,6 +3864,31 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } + +static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event, + struct seq_file *seq) +{ + const char *name; + int err; + u32 prog_id, type; + u64 offset, ref_ctr_offset; + unsigned long missed; + + err = bpf_get_perf_event_info(event, &prog_id, &type, &name, + &offset, &ref_ctr_offset, &missed); + if (err) + return; + + seq_printf(seq, + "name:\t%s\n" + "offset:\t%#llx\n" + "ref_ctr_offset:\t%#llx\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, offset, ref_ctr_offset, + type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe", + event->bpf_cookie); +} #endif static int bpf_perf_link_fill_probe(const struct perf_event *event, @@ -3890,10 +3957,79 @@ static int bpf_perf_link_fill_link_info(const struct bpf_link *link, } } +static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ + seq_printf(seq, + "type:\t%u\n" + "config:\t%llu\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + event->attr.type, event->attr.config, + "event", event->bpf_cookie); +} + +static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ + int err; + const char *name; + u32 prog_id; + + err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL, + NULL, NULL); + if (err) + return; + + seq_printf(seq, + "tp_name:\t%s\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, "tracepoint", event->bpf_cookie); +} + +static void bpf_probe_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ +#ifdef CONFIG_KPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) + return bpf_perf_link_fdinfo_kprobe(event, seq); +#endif + +#ifdef CONFIG_UPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) + return bpf_perf_link_fdinfo_uprobe(event, seq); +#endif +} + +static void bpf_perf_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_perf_link *perf_link; + const struct perf_event *event; + + perf_link = container_of(link, struct bpf_perf_link, link); + event = perf_get_event(perf_link->perf_file); + if (IS_ERR(event)) + return; + + switch (event->prog->type) { + case BPF_PROG_TYPE_PERF_EVENT: + return bpf_perf_event_link_show_fdinfo(event, seq); + case BPF_PROG_TYPE_TRACEPOINT: + return bpf_tracepoint_link_show_fdinfo(event, seq); + case BPF_PROG_TYPE_KPROBE: + return bpf_probe_link_show_fdinfo(event, seq); + default: + return; + } +} + static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, .fill_link_info = bpf_perf_link_fill_link_info, + .show_fdinfo = bpf_perf_link_show_fdinfo, }; static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -3916,7 +4052,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro err = -ENOMEM; goto out_put_file; } - bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, + attr->link_create.attach_type); link->perf_file = perf_file; err = bpf_link_prime(&link->link, &link_primer); @@ -3948,7 +4085,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int 
bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name, u64 cookie) + const char __user *user_tp_name, u64 cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3971,7 +4109,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0, 0); + return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) @@ -3993,7 +4131,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog, + &bpf_raw_tp_link_lops, prog, attach_type, tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; @@ -4035,7 +4173,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); cookie = attr->raw_tracepoint.cookie; - fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); if (fd < 0) bpf_prog_put(prog); return fd; @@ -4185,6 +4323,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } +static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, + bool check_atype) +{ + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + return true; + case BPF_PROG_TYPE_LSM: + return check_atype ? 
atype == BPF_LSM_CGROUP : true; + default: + return false; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD expected_revision #define BPF_F_ATTACH_MASK_BASE \ @@ -4215,6 +4372,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (bpf_mprog_supported(ptype)) { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) + return -EINVAL; } else { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) return -EINVAL; @@ -4232,6 +4392,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } + if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { + ret = cgroup_bpf_prog_attach(attr, ptype, prog); + goto out; + } + switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: @@ -4243,20 +4408,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; - case BPF_PROG_TYPE_CGROUP_DEVICE: - case BPF_PROG_TYPE_CGROUP_SKB: - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - case BPF_PROG_TYPE_CGROUP_SYSCTL: - case BPF_PROG_TYPE_SOCK_OPS: - case BPF_PROG_TYPE_LSM: - if (ptype == BPF_PROG_TYPE_LSM && - prog->expected_attach_type != BPF_LSM_CGROUP) - ret = -EINVAL; - else - ret = cgroup_bpf_prog_attach(attr, ptype, prog); - break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) @@ -4267,7 +4418,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) default: ret = -EINVAL; } - +out: if (ret) bpf_prog_put(prog); return ret; @@ -4295,6 +4446,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); } + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags || attr->relative_fd) + return -EINVAL; } else if (attr->attach_flags || attr->relative_fd || attr->expected_revision) { @@ -5085,6 +5239,21 @@ static int bpf_link_get_info_by_fd(struct file *file, } +static int token_get_info_by_fd(struct file *file, + struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); + if (err) + return err; + return bpf_token_get_info_by_fd(token, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -5108,6 +5277,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + else if (fd_file(f)->f_op == &bpf_token_fops) + return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, + attr, uattr); return -EINVAL; } @@ -5195,21 +5367,10 @@ static int bpf_task_fd_query_copy(const union bpf_attr *attr, if (put_user(zero, ubuf)) return -EFAULT; - } else if (input_len >= len + 1) { - /* ubuf can hold the string with NULL terminator */ - if (copy_to_user(ubuf, buf, len + 1)) - return -EFAULT; } else { - /* ubuf cannot hold the string with NULL terminator, - * do a partial copy with NULL terminator. 
- */ - char zero = '\0'; - - err = -ENOSPC; - if (copy_to_user(ubuf, buf, input_len - 1)) - return -EFAULT; - if (put_user(zero, ubuf + input_len - 1)) - return -EFAULT; + err = bpf_copy_to_user(ubuf, buf, input_len, len); + if (err == -EFAULT) + return err; } } @@ -5387,7 +5548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: @@ -5396,7 +5558,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, + attr->link_create.attach_type); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -5405,7 +5568,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: @@ -5794,6 +5958,28 @@ static int token_create(union bpf_attr *attr) return bpf_token_create(attr); } +#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd + +static int prog_stream_read(union bpf_attr *attr) +{ + char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); + u32 len = attr->prog_stream_read.stream_buf_len; + struct bpf_prog *prog; + int ret; + + if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_stream_read.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); + bpf_prog_put(prog); + + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -5930,6 +6116,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_TOKEN_CREATE: err = token_create(&attr); break; + case BPF_PROG_STREAM_READ_BY_FD: + err = prog_stream_read(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 8e61dc555415..9cbe15ce3540 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -45,7 +45,7 @@ static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, - .read_new = sysfs_bin_attr_simple_read, + .read = sysfs_bin_attr_simple_read, .mmap = btf_sysfs_vmlinux_mmap, }; diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index 2e4885e7781f..efd987ea6872 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -142,7 +142,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct tcx_link *tcx = tcx_link(link); - bool created, ingress = tcx->location == BPF_TCX_INGRESS; + bool created, ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev = tcx->dev; int ret; @@ -169,7 +169,7 @@ static int tcx_link_prog_attach(struct bpf_link 
*link, u32 flags, u32 id_or_fd, static void tcx_link_release(struct bpf_link *link) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -204,7 +204,7 @@ static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, struct bpf_prog *oprog) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -260,8 +260,8 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) seq_printf(seq, "ifindex:\t%u\n", ifindex); seq_printf(seq, "attach_type:\t%u (%s)\n", - tcx->location, - tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress"); + link->attach_type, + link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress"); } static int tcx_link_fill_info(const struct bpf_link *link, @@ -276,7 +276,7 @@ static int tcx_link_fill_info(const struct bpf_link *link, rtnl_unlock(); info->tcx.ifindex = ifindex; - info->tcx.attach_type = tcx->location; + info->tcx.attach_type = link->attach_type; return 0; } @@ -301,8 +301,8 @@ static int tcx_link_init(struct tcx_link *tcx, struct net_device *dev, struct bpf_prog *prog) { - bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); - tcx->location = attr->link_create.attach_type; + bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog, + attr->link_create.attach_type); tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); } diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 9dbc31b25e3d..fa353c5d550f 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -83,6 +83,11 @@ struct tnum tnum_sub(struct tnum a, struct tnum b) return TNUM(dv & ~mu, mu); } +struct tnum tnum_neg(struct tnum a) +{ + return tnum_sub(TNUM(0, 0), a); +} + struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 26057aa13503..0bbe412f854e 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -103,7 +103,7 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) static const struct inode_operations bpf_token_iops = { }; -static const struct file_operations bpf_token_fops = { +const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; @@ -210,6 +210,29 @@ out_file: return err; } +int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_token_info info; + u32 info_len = attr->info.info_len; + + info_len = min_t(u32, info_len, sizeof(info)); + memset(&info, 0, sizeof(info)); + + info.allowed_cmds = token->allowed_cmds; + info.allowed_maps = token->allowed_maps; + info.allowed_progs = token->allowed_progs; + info.allowed_attachs = token->allowed_attachs; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c4b1a98ff726..0e364614c3a2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -674,7 +674,8 @@ static const struct bpf_link_ops 
bpf_shim_tramp_link_lops = { static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, bpf_func_t bpf_func, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_prog *p; @@ -701,7 +702,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p); + &bpf_shim_tramp_link_lops, p, attach_type); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -726,7 +727,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, } int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_attach_target_info tgt_info = {}; @@ -763,7 +765,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, /* Allocate and install new shim. */ - shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type); if (!shim_link) { err = -ENOMEM; goto err; @@ -911,27 +913,32 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram return bpf_prog_start_time(); } -static void notrace update_prog_stats(struct bpf_prog *prog, - u64 start) +static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start) { struct bpf_prog_stats *stats; + unsigned long flags; + u64 duration; - if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter* - * and disabled in __bpf_prog_exit*. - * And vice versa. - * Hence check that 'start' is valid. - */ - start > NO_START_TIME) { - u64 duration = sched_clock() - start; - unsigned long flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, duration); - u64_stats_update_end_irqrestore(&stats->syncp, flags); - } + /* + * static_key could be enabled in __bpf_prog_enter* and disabled in + * __bpf_prog_exit*. And vice versa. Check that 'start' is valid. 
+ */ + if (start <= NO_START_TIME) + return; + + duration = sched_clock() - start; + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, duration); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + +static __always_inline void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) +{ + if (static_branch_unlikely(&bpf_stats_enabled_key)) + __update_prog_stats(prog, start); } static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a0d663be91a2..399f03e62508 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -44,6 +44,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +enum bpf_features { + BPF_FEAT_RDONLY_CAST_TO_VOID = 0, + BPF_FEAT_STREAMS = 1, + __MAX_BPF_FEAT, +}; + struct bpf_mem_alloc bpf_global_percpu_ma; static bool bpf_global_percpu_ma_set; @@ -405,7 +411,8 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || - type == PTR_TO_MEM; + (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || + type == CONST_PTR_TO_MAP; } static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) @@ -855,7 +862,7 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re * dynptr */ if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verbose(env, "verifier internal error: misconfigured ref_obj_id\n"); + verifier_bug(env, "misconfigured ref_obj_id"); return -EFAULT; } if (state->stack[i].spilled_ptr.dynptr.first_slot) @@ -1403,7 +1410,7 @@ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) goto out; alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); - new_arr = krealloc(arr, alloc_size, GFP_KERNEL); + new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT); if (!new_arr) { kfree(arr); return NULL; @@ -1420,7 +1427,7 @@ out: static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, - sizeof(struct bpf_reference_state), GFP_KERNEL); + sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT); if (!dst->refs) return -ENOMEM; @@ -1439,7 +1446,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st size_t n = src->allocated_stack / BPF_REG_SIZE; dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dst->stack) return -ENOMEM; @@ -1647,7 +1654,7 @@ static void update_peak_states(struct bpf_verifier_env *env) { u32 cur_states; - cur_states = env->explored_states_size + env->free_list_size; + cur_states = env->explored_states_size + env->free_list_size + env->num_backedges; env->peak_states = max(env->peak_states, cur_states); } @@ -1659,6 +1666,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1669,11 +1683,12 @@ static void free_verifier_state(struct bpf_verifier_state *state, state->frame[i] = NULL; } kfree(state->refs); + clear_jmp_history(state); if (free_self) kfree(state); } 
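[Illustrative aside, not part of the patch.] The copy_verifier_state(), clear_jmp_history() and free_verifier_state() hunks above move the verifier's jump history from the env-wide insn_hist buffer back into each bpf_verifier_state: the copy duplicates the array with GFP_KERNEL_ACCOUNT, and each state releases only its own buffer. A minimal sketch of that ownership pattern, in plain userspace C with malloc/realloc/free standing in for the accounted kernel allocators and a struct reduced to the two relevant fields (all names here are illustrative, not taken from the patch):

#include <stdlib.h>
#include <string.h>

struct jmp_history_entry {
	unsigned int idx;
	unsigned int prev_idx;
};

struct toy_state {
	struct jmp_history_entry *jmp_history;
	unsigned int jmp_history_cnt;
};

/* Counterpart of the copy_verifier_state() hunk: the copy owns its own buffer. */
static int toy_copy_state(struct toy_state *dst, const struct toy_state *src)
{
	size_t sz = src->jmp_history_cnt * sizeof(*src->jmp_history);
	struct jmp_history_entry *p = realloc(dst->jmp_history, sz ? sz : 1);

	if (!p)
		return -1;
	memcpy(p, src->jmp_history, sz);
	dst->jmp_history = p;
	dst->jmp_history_cnt = src->jmp_history_cnt;
	return 0;
}

/* Counterpart of clear_jmp_history(): a state frees only the buffer it owns. */
static void toy_clear_history(struct toy_state *st)
{
	free(st->jmp_history);
	st->jmp_history = NULL;
	st->jmp_history_cnt = 0;
}

The per-state copy is what lets free_verifier_state() above release the history unconditionally, without any reference counting against a shared env buffer.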
-/* struct bpf_verifier_state->{parent,loop_entry} refer to states +/* struct bpf_verifier_state->parent refers to states * that are in either of env->{expored_states,free_list}. * In both cases the state is contained in struct bpf_verifier_state_list. */ @@ -1684,37 +1699,24 @@ static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_ return NULL; } -static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) -{ - if (st->loop_entry) - return container_of(st->loop_entry, struct bpf_verifier_state_list, state); - return NULL; -} +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st); /* A state can be freed if it is no longer referenced: * - is in the env->free_list; * - has no children states; - * - is not used as loop_entry. - * - * Freeing a state can make it's loop_entry free-able. */ static void maybe_free_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state_list *sl) { - struct bpf_verifier_state_list *loop_entry_sl; - - while (sl && sl->in_free_list && - sl->state.branches == 0 && - sl->state.used_as_loop_entry == 0) { - loop_entry_sl = state_loop_entry_as_list(&sl->state); - if (loop_entry_sl) - loop_entry_sl->state.used_as_loop_entry--; - list_del(&sl->node); - free_verifier_state(&sl->state, false); - kfree(sl); - env->free_list_size--; - sl = loop_entry_sl; - } + if (!sl->in_free_list + || sl->state.branches != 0 + || incomplete_read_marks(env, &sl->state)) + return; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; } /* copy verifier state from src to dst growing dst stack space @@ -1733,6 +1735,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; + dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_KERNEL_ACCOUNT); + if (!dst_state->jmp_history) + return -ENOMEM; + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. */ @@ -1750,17 +1759,14 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; - dst_state->insn_hist_start = src->insn_hist_start; - dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; - dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; - dst_state->loop_entry = src->loop_entry; + dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = kzalloc(sizeof(*dst), GFP_KERNEL_ACCOUNT); if (!dst) return -ENOMEM; dst_state->frame[i] = dst; @@ -1799,181 +1805,241 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta return true; } -/* Open coded iterators allow back-edges in the state graph in order to - * check unbounded loops that iterators. 
- * - * In is_state_visited() it is necessary to know if explored states are - * part of some loops in order to decide whether non-exact states - * comparison could be used: - * - non-exact states comparison establishes sub-state relation and uses - * read and precision marks to do so, these marks are propagated from - * children states and thus are not guaranteed to be final in a loop; - * - exact states comparison just checks if current and explored states - * are identical (and thus form a back-edge). - * - * Paper "A New Algorithm for Identifying Loops in Decompilation" - * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient - * algorithm for loop structure detection and gives an overview of - * relevant terminology. It also has helpful illustrations. - * - * [1] https://api.semanticscholar.org/CorpusID:15784067 - * - * We use a similar algorithm but because loop nested structure is - * irrelevant for verifier ours is significantly simpler and resembles - * strongly connected components algorithm from Sedgewick's textbook. - * - * Define topmost loop entry as a first node of the loop traversed in a - * depth first search starting from initial state. The goal of the loop - * tracking algorithm is to associate topmost loop entries with states - * derived from these entries. - * - * For each step in the DFS states traversal algorithm needs to identify - * the following situations: - * - * initial initial initial - * | | | - * V V V - * ... ... .---------> hdr - * | | | | - * V V | V - * cur .-> succ | .------... - * | | | | | | - * V | V | V V - * succ '-- cur | ... ... - * | | | - * | V V - * | succ <- cur - * | | - * | V - * | ... - * | | - * '----' - * - * (A) successor state of cur (B) successor state of cur or it's entry - * not yet traversed are in current DFS path, thus cur and succ - * are members of the same outermost loop - * - * initial initial - * | | - * V V - * ... ... - * | | - * V V - * .------... .------... - * | | | | - * V V V V - * .-> hdr ... ... ... - * | | | | | - * | V V V V - * | succ <- cur succ <- cur - * | | | - * | V V - * | ... ... - * | | | - * '----' exit - * - * (C) successor state of cur is a part of some loop but this loop - * does not include cur or successor state is not in a loop at all. - * - * Algorithm could be described as the following python code: - * - * traversed = set() # Set of traversed nodes - * entries = {} # Mapping from node to loop entry - * depths = {} # Depth level assigned to graph node - * path = set() # Current DFS path - * - * # Find outermost loop entry known for n - * def get_loop_entry(n): - * h = entries.get(n, None) - * while h in entries: - * h = entries[h] - * return h - * - * # Update n's loop entry if h comes before n in current DFS path. - * def update_loop_entry(n, h): - * if h in path and depths[entries.get(n, n)] < depths[n]: - * entries[n] = h1 +/* Return IP for a given frame in a call stack */ +static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) +{ + return frame == st->curframe + ? st->insn_idx + : st->frame[frame + 1]->callsite; +} + +/* For state @st look for a topmost frame with frame_insn_idx() in some SCC, + * if such frame exists form a corresponding @callchain as an array of + * call sites leading to this frame and SCC id. 
+ * E.g.: * - * def dfs(n, depth): - * traversed.add(n) - * path.add(n) - * depths[n] = depth - * for succ in G.successors(n): - * if succ not in traversed: - * # Case A: explore succ and update cur's loop entry - * # only if succ's entry is in current DFS path. - * dfs(succ, depth + 1) - * h = entries.get(succ, None) - * update_loop_entry(n, h) - * else: - * # Case B or C depending on `h1 in path` check in update_loop_entry(). - * update_loop_entry(n, succ) - * path.remove(n) + * void foo() { A: loop {... SCC#1 ...}; } + * void bar() { B: loop { C: foo(); ... SCC#2 ... } + * D: loop { E: foo(); ... SCC#3 ... } } + * void main() { F: bar(); } * - * To adapt this algorithm for use with verifier: - * - use st->branch == 0 as a signal that DFS of succ had been finished - * and cur's loop entry has to be updated (case A), handle this in - * update_branch_counts(); - * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(). + * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending + * on @st frame call sites being (F,C,A) or (F,E,A). */ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) +static bool compute_scc_callchain(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_callchain *callchain) { - struct bpf_verifier_state *topmost = st->loop_entry; - u32 steps = 0; + u32 i, scc, insn_idx; - while (topmost && topmost->loop_entry) { - if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop")) - return ERR_PTR(-EFAULT); - topmost = topmost->loop_entry; + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= st->curframe; i++) { + insn_idx = frame_insn_idx(st, i); + scc = env->insn_aux_data[insn_idx].scc; + if (scc) { + callchain->scc = scc; + break; + } else if (i < st->curframe) { + callchain->callsites[i] = insn_idx; + } else { + return false; + } } - return topmost; + return true; } -static void update_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +/* Check if bpf_scc_visit instance for @callchain exists. */ +static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) { - /* The hdr->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr->branches == 0 then - * head's topmost loop entry is not in current DFS path, - * hence 'cur' and 'hdr' are not in the same loop and there is - * no need to update cur->loop_entry. - */ - if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { - if (cur->loop_entry) { - cur->loop_entry->used_as_loop_entry--; - maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); - } - cur->loop_entry = hdr; - hdr->used_as_loop_entry++; + struct bpf_scc_info *info = env->scc_info[callchain->scc]; + struct bpf_scc_visit *visits = info->visits; + u32 i; + + if (!info) + return NULL; + for (i = 0; i < info->num_visits; i++) + if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) + return &visits[i]; + return NULL; +} + +/* Allocate a new bpf_scc_visit instance corresponding to @callchain. + * Allocated instances are alive for a duration of the do_check_common() + * call and are freed by free_states(). 
+ */ +static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_visit *visit; + struct bpf_scc_info *info; + u32 scc, num_visits; + u64 new_sz; + + scc = callchain->scc; + info = env->scc_info[scc]; + num_visits = info ? info->num_visits : 0; + new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); + info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); + if (!info) + return NULL; + env->scc_info[scc] = info; + info->num_visits = num_visits + 1; + visit = &info->visits[num_visits]; + memset(visit, 0, sizeof(*visit)); + memcpy(&visit->callchain, callchain, sizeof(*callchain)); + return visit; +} + +/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ +static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) +{ + char *buf = env->tmp_str_buf; + int i, delta = 0; + + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); + for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { + if (!callchain->callsites[i]) + break; + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", + callchain->callsites[i]); } + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); + return env->tmp_str_buf; } -static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +/* If callchain for @st exists (@st is in some SCC), ensure that + * bpf_scc_visit instance for this callchain exists. + * If instance does not exist or is empty, assign visit->entry_state to @st. + */ +static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return 0; + visit = scc_visit_lookup(env, callchain); + visit = visit ?: scc_visit_alloc(env, callchain); + if (!visit) + return -ENOMEM; + if (!visit->entry_state) { + visit->entry_state = st; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); + } + return 0; +} + +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); + +/* If callchain for @st exists (@st is in some SCC), make it empty: + * - set visit->entry_state to NULL; + * - flush accumulated backedges. + */ +static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return 0; + visit = scc_visit_lookup(env, callchain); + if (!visit) { + verifier_bug(env, "scc exit: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; + } + if (visit->entry_state != st) + return 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); + visit->entry_state = NULL; + env->num_backedges -= visit->num_backedges; + visit->num_backedges = 0; + update_peak_states(env); + return propagate_backedges(env, visit); +} + +/* Lookup an bpf_scc_visit instance corresponding to @st callchain + * and add @backedge to visit->backedges. @st callchain must exist. 
+ */ +static int add_scc_backedge(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_backedge *backedge) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) { + verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", + st->insn_idx); + return -EFAULT; + } + visit = scc_visit_lookup(env, callchain); + if (!visit) { + verifier_bug(env, "add backedge: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; + } + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); + backedge->next = visit->backedges; + visit->backedges = backedge; + visit->num_backedges++; + env->num_backedges++; + update_peak_states(env); + return 0; +} + +/* bpf_reg_state->live marks for registers in a state @st are incomplete, + * if state @st is in some SCC and not all execution paths starting at this + * SCC are fully explored. + */ +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return false; + visit = scc_visit_lookup(env, callchain); + if (!visit) + return false; + return !!visit->backedges; +} + +static void free_backedges(struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge, *next; + + for (backedge = visit->backedges; backedge; backedge = next) { + free_verifier_state(&backedge->state, false); + next = backedge->next; + kvfree(backedge); + } + visit->backedges = NULL; +} + +static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_verifier_state_list *sl = NULL, *parent_sl; struct bpf_verifier_state *parent; + int err; while (st) { u32 br = --st->branches; - /* br == 0 signals that DFS exploration for 'st' is finished, - * thus it is necessary to update parent's loop entry if it - * turned out that st is a part of some loop. - * This is a part of 'case A' in get_loop_entry() comment. - */ - if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(env, st->parent, st->loop_entry); - - /* WARN_ON(br > 1) technically makes sense here, + /* verifier_bug_if(br > 1, ...) technically makes sense here, * but see comment in push_stack(), hence: */ - WARN_ONCE((int)br < 0, - "BUG update_branch_counts:branches_to_explore=%d\n", - br); + verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br); if (br) break; + err = maybe_exit_scc(env, st); + if (err) + return err; parent = st->parent; parent_sl = state_parent_as_list(st); if (sl) @@ -1981,6 +2047,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi st = parent; sl = parent_sl; } + return 0; } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, @@ -2012,6 +2079,18 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, return 0; } +static bool error_recoverable_with_nospec(int err) +{ + /* Should only return true for non-fatal errors that are allowed to + * occur during speculative verification. For these we can insert a + * nospec and the program might still be accepted. Do not include + * something like ENOMEM because it is likely to re-occur for the next + * architectural path once it has been recovered-from in all speculative + * paths. 
+ */ + return err == -EPERM || err == -EACCES || err == -EINVAL; +} + static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) @@ -2020,9 +2099,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; int err; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2032,12 +2111,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size++; err = copy_verifier_state(&elem->st, cur); if (err) - goto err; + return NULL; elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); - goto err; + return NULL; } if (elem->st.parent) { ++elem->st.parent->branches; @@ -2052,12 +2131,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, */ } return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } #define CALLER_SAVED_REGS 6 @@ -2450,6 +2523,58 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) if ((u64)reg->smin_value <= (u64)reg->smax_value) { reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); + } else { + /* If the s64 range crosses the sign boundary, then it's split + * between the beginning and end of the U64 domain. In that + * case, we can derive new bounds if the u64 range overlaps + * with only one end of the s64 range. + * + * In the following example, the u64 range overlaps only with + * positive portion of the s64 range. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * We can thus derive the following new s64 and u64 ranges. + * + * 0 U64_MAX + * | [xxxxxx u64 range xxxxx] | + * |----------------------------|----------------------------| + * | [xxxxxx s64 range xxxxx] | + * 0 S64_MAX S64_MIN -1 + * + * If they overlap in two places, we can't derive anything + * because reg_state can't represent two ranges per numeric + * domain. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * The first condition below corresponds to the first diagram + * above. 
+ */ + if (reg->umax_value < (u64)reg->smin_value) { + reg->smin_value = (s64)reg->umin_value; + reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); + } else if ((u64)reg->smax_value < reg->umin_value) { + /* This second condition considers the case where the u64 range + * overlaps with the negative portion of the s64 range: + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | + * 0 S64_MAX S64_MIN -1 + */ + reg->smax_value = (s64)reg->umax_value; + reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); + } } } @@ -2481,20 +2606,6 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) reg->smin_value = max_t(s64, reg->smin_value, new_smin); reg->smax_value = min_t(s64, reg->smax_value, new_smax); - /* if s32 can be treated as valid u32 range, we can use it as well */ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - /* s32 -> u64 tightening */ - new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); - /* s32 -> s64 tightening */ - new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); - } - /* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. * @@ -2561,6 +2672,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) /* We might have learned something about the sign bit. */ __reg_deduce_bounds(reg); __reg_deduce_bounds(reg); + __reg_deduce_bounds(reg); /* We might have learned some bits from the bounds. 
*/ __reg_bound_offset(reg); /* Intersecting with the old var_off might have improved our bounds @@ -2607,13 +2719,13 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, - reg->var_off.value, reg->var_off.mask); + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " + "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", + ctx, msg, reg->umin_value, reg->umax_value, + reg->smin_value, reg->smax_value, + reg->u32_min_value, reg->u32_max_value, + reg->s32_min_value, reg->s32_max_value, + reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; __mark_reg_unbounded(reg); @@ -2723,22 +2835,33 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(env, regs + regno); } -static void mark_btf_ld_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id, - enum bpf_type_flag flag) +static int mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { - if (reg_type == SCALAR_VALUE) { + switch (reg_type) { + case SCALAR_VALUE: mark_reg_unknown(env, regs, regno); - return; + return 0; + case PTR_TO_BTF_ID: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID | flag; + regs[regno].btf = btf; + regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; + return 0; + case PTR_TO_MEM: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_MEM | flag; + regs[regno].mem_size = 0; + return 0; + default: + verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__); + return -EFAULT; } - mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID | flag; - regs[regno].btf = btf; - regs[regno].btf_id = btf_id; - if (type_may_be_null(flag)) - regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -2787,9 +2910,9 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2801,35 +2924,24 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); - goto err; + return NULL; } /* Unlike push_stack() do not copy_verifier_state(). * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). - * But we do need to make sure to not clobber insn_hist, so we keep - * chaining insn_hist_start/insn_hist_end indices as for a normal - * child state. 
*/ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; - elem->st.insn_hist_start = env->cur_state->insn_hist_end; - elem->st.insn_hist_end = elem->st.insn_hist_start; - frame = kzalloc(sizeof(*frame), GFP_KERNEL); + frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) - goto err; + return NULL; init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, subprog /* subprog number within this prog */); elem->st.frame[0] = frame; return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } @@ -3167,7 +3279,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } - tab = kzalloc(sizeof(*tab), GFP_KERNEL); + tab = kzalloc(sizeof(*tab), GFP_KERNEL_ACCOUNT); if (!tab) return -ENOMEM; prog_aux->kfunc_tab = tab; @@ -3183,7 +3295,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return 0; if (!btf_tab && offset) { - btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL); + btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT); if (!btf_tab) return -ENOMEM; prog_aux->kfunc_btf_tab = btf_tab; @@ -3843,10 +3955,11 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - struct bpf_insn_hist_entry *p; + u32 cnt = cur->jmp_history_cnt; + struct bpf_jmp_history_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3866,32 +3979,29 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s return 0; } - if (cur->insn_hist_end + 1 > env->insn_hist_cap) { - alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); - p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - env->insn_hist = p; - env->insn_hist_cap = alloc_size / sizeof(*p); - } + cnt++; + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); + if (!p) + return -ENOMEM; + cur->jmp_history = p; - p = &env->insn_hist[cur->insn_hist_end]; + p = &cur->jmp_history[cnt - 1]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - - cur->insn_hist_end++; + cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; return 0; } -static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, - u32 hist_start, u32 hist_end, int insn_idx) +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) { - if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) - return &env->insn_hist[hist_end - 1]; + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; return NULL; } @@ -3908,26 +4018,25 @@ static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env * * history entry recording a jump from last instruction of parent state and * first instruction of given state. 
*/ -static int get_prev_insn_idx(const struct bpf_verifier_env *env, - struct bpf_verifier_state *st, - int insn_idx, u32 hist_start, u32 *hist_endp) +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) { - u32 hist_end = *hist_endp; - u32 cnt = hist_end - hist_start; + u32 cnt = *history; - if (insn_idx == st->first_insn_idx) { + if (i == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) + if (cnt == 1 && st->jmp_history[0].idx == i) return -ENOENT; } - if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { - (*hist_endp)--; - return env->insn_hist[hist_end - 1].prev_idx; + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; } else { - return insn_idx - 1; + i--; } + return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -4108,7 +4217,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -4153,7 +4262,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); @@ -4448,7 +4557,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * . if (scalar cond K|scalar) * . helper_call(.., scalar, ...) where ARG_CONST is expected * backtrack through the verifier states and mark all registers and - * stack slots with spilled constants that these scalar regisers + * stack slots with spilled constants that these scalar registers * should be precise. * . during state pruning two registers (or spilled stack slots) * are equivalent if both are not precise. @@ -4571,7 +4680,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded insn_hist and is able to + * logic). This backtracking relies on recorded jmp_history and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. @@ -4651,23 +4760,27 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * mark_all_scalars_imprecise() to hopefully get more permissive and generic * finalized states which help in short circuiting more future states. 
*/ -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) +static int __mark_chain_precision(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state, + int regno, + bool *changed) { + struct bpf_verifier_state *st = starting_state; struct backtrack_state *bt = &env->bt; - struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; - int last_idx = env->insn_idx; + int last_idx = starting_state->insn_idx; int subseq_idx = -1; struct bpf_func_state *func; + bool tmp, skip_first = true; struct bpf_reg_state *reg; - bool skip_first = true; int i, fr, err; if (!env->bpf_capable) return 0; + changed = changed ?: &tmp; /* set frame number from which we are starting to backtrack */ - bt_init(bt, env->cur_state->curframe); + bt_init(bt, starting_state->curframe); /* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision @@ -4677,7 +4790,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); + verifier_bug(env, "backtracking misuse"); return -EFAULT; } bt_set_reg(bt, regno); @@ -4688,9 +4801,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 hist_start = st->insn_hist_start; - u32 hist_end = st->insn_hist_end; - struct bpf_insn_hist_entry *hist; + u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4712,8 +4824,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; bt_clear_reg(bt, i); - if (reg->type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) { reg->precise = true; + *changed = true; + } } return 0; } @@ -4728,11 +4842,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_insn_hist_entry(env, hist_start, hist_end, i); + hist = get_jmp_hist_entry(st, history, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { - mark_all_scalars_precise(env, env->cur_state); + mark_all_scalars_precise(env, starting_state); bt_reset(bt); return 0; } else if (err) { @@ -4745,7 +4859,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); + i = get_prev_insn_idx(st, i, &history); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -4772,10 +4886,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt_clear_frame_reg(bt, fr, i); continue; } - if (reg->precise) + if (reg->precise) { bt_clear_frame_reg(bt, fr, i); - else + } else { reg->precise = true; + *changed = true; + } } bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); @@ -4790,10 +4906,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) continue; } reg = &func->stack[i].spilled_ptr; - if (reg->precise) + if (reg->precise) { bt_clear_frame_slot(bt, fr, i); - else + } else { reg->precise = true; + *changed = true; + } } if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, @@ -4820,7 +4938,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * 
fallback to marking all precise */ if (!bt_empty(bt)) { - mark_all_scalars_precise(env, env->cur_state); + mark_all_scalars_precise(env, starting_state); bt_reset(bt); } @@ -4829,15 +4947,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno); + return __mark_chain_precision(env, env->cur_state, regno, NULL); } /* mark_chain_precision_batch() assumes that env->bt is set in the caller to * desired reg and stack masks across all relevant frames */ -static int mark_chain_precision_batch(struct bpf_verifier_env *env) +static int mark_chain_precision_batch(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state) { - return __mark_chain_precision(env, -1); + return __mark_chain_precision(env, starting_state, -1, NULL); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -5026,7 +5145,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (sanitize) - env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + env->insn_aux_data[insn_idx].nospec_result = true; } err = destroy_if_dynptr_stack_slot(env, state, spi); @@ -5109,7 +5228,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5416,7 +5535,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5896,6 +6015,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); struct bpf_reg_state *val_reg; + int ret; /* Things we already checked for in check_map_access and caller: * - Reject cases where variable offset may touch kptr @@ -5929,8 +6049,11 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. 
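 * For example, for a map value declared in a BPF program as
 *   struct map_value { struct task_struct __kptr *task; };
 * a load of the 'task' field is marked as PTR_TO_BTF_ID for
 * struct task_struct, with the trust flag chosen by
 * btf_ld_kptr_type().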
*/ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); + ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + btf_ld_kptr_type(env, kptr_field)); + if (ret < 0) + return ret; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -7143,7 +7266,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { - verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); + verifier_bug(env, "reg->btf must be kernel btf"); return -EFAULT; } ret = env->ops->btf_struct_access(&env->log, reg, off, size); @@ -7159,7 +7282,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); + verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); return -EFAULT; } @@ -7229,8 +7352,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + if (atype == BPF_READ && value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } @@ -7284,13 +7410,19 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, sizeof(map_reg)); - mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, + btf_vmlinux, *map->ops->map_btf_id, 0); + if (ret < 0) + return ret; ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; - if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } @@ -7474,6 +7606,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (base_type(reg->type) == PTR_TO_MEM) { bool rdonly_mem = type_is_rdonly_mem(reg->type); + bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -7493,8 +7626,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_mem_region_access(env, regno, off, size, - reg->mem_size, false); + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to check bounds in that case. 
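+ * Such pointers are produced e.g. by bpf_rdonly_cast() with a void
+ * target type; the loads are emitted as BPF_PROBE_MEM* variants, so
+ * a faulting access zeroes the destination register instead of
+ * crashing.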
+ */ + if (!rdonly_untrusted) + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { @@ -8521,7 +8659,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): */ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + verifier_bug(env, "misconfigured dynptr helper type flags"); return -EFAULT; } @@ -8887,8 +9025,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { - verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n", - cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); + verifier_bug(env, "unexpected iterator state %d (%s)", + cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); return -EFAULT; } @@ -8898,7 +9036,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, */ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || !same_callsites(cur_st->parent, cur_st)) { - verbose(env, "bug: bad parent state for iter next call"); + verifier_bug(env, "bad parent state for iter next call"); return -EFAULT; } /* Note cur_st->parent in the call below, it is necessary to skip @@ -8957,8 +9095,8 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, { if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->type\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->type"); + return -EFAULT; } switch (meta->map_ptr->map_type) { @@ -9104,7 +9242,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { - verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + verifier_bug(env, "unsupported arg type %d", arg_type); return -EFAULT; } @@ -9186,7 +9324,7 @@ found: if (!arg_btf_id) { if (!compatible->btf_id) { - verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); + verifier_bug(env, "missing arg compatible BTF ID"); return -EFAULT; } arg_btf_id = compatible->btf_id; @@ -9218,7 +9356,7 @@ found: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { - verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); + verifier_bug(env, "unimplemented handling of MEM_ALLOC"); return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ @@ -9233,7 +9371,7 @@ found: /* Handled by helper specific checks */ break; default: - verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n"); + verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match"); return -EFAULT; } return 0; @@ -9494,7 +9632,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, * to prevent pruning on it. 
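 * (E.g. a stack slot holding a constant map key must stay precise,
 * otherwise a later state with a different key in that slot could be
 * judged equivalent and wrongly pruned against this one.)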
*/ bt_set_frame_slot(&env->bt, key->frameno, spi); - err = mark_chain_precision_batch(env); + err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err; @@ -9591,7 +9729,7 @@ skip_type_check: return -EINVAL; } if (meta->release_regno) { - verbose(env, "verifier internal error: more than one release argument\n"); + verifier_bug(env, "more than one release argument"); return -EFAULT; } meta->release_regno = regno; @@ -9599,10 +9737,10 @@ skip_type_check: if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + verbose(env, "more than one arg with ref_obj_id R%d %u %u", regno, reg->ref_obj_id, meta->ref_obj_id); - return -EFAULT; + return -EACCES; } meta->ref_obj_id = reg->ref_obj_id; } @@ -9645,8 +9783,8 @@ skip_type_check: * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->key\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->key"); + return -EFAULT; } key_size = meta->map_ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); @@ -9672,8 +9810,8 @@ skip_type_check: */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->value\n"); - return -EACCES; + verifier_bug(env, "invalid map_ptr to access map->value"); + return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, @@ -9702,7 +9840,7 @@ skip_type_check: if (err) return err; } else { - verbose(env, "verifier internal error\n"); + verifier_bug(env, "spin lock arg on unexpected helper"); return -EFAULT; } break; @@ -10283,7 +10421,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } caller = state->frame[state->curframe]; - callee = kzalloc(sizeof(*callee), GFP_KERNEL); + callee = kzalloc(sizeof(*callee), GFP_KERNEL_ACCOUNT); if (!callee) return -ENOMEM; state->frame[state->curframe + 1] = callee; @@ -10338,6 +10476,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "R%d is not a scalar\n", regno); return -EINVAL; } + } else if (arg->arg_type & PTR_UNTRUSTED) { + /* + * Anything is allowed for untrusted arguments, as these are + * read-only and probe read instructions would protect against + * invalid memory access. 
+ */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); if (ret < 0) @@ -10860,8 +11004,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return -EINVAL; } if (!calls_callback(env, callee->callsite)) { - verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", - *insn_idx, callee->callsite); + verifier_bug(env, "in callback at %d, callsite %d !calls_callback", + *insn_idx, callee->callsite); return -EFAULT; } } else { @@ -10968,8 +11112,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; if (map == NULL) { - verbose(env, "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + verifier_bug(env, "expected map for helper call"); + return -EFAULT; } /* In case of read-only, some additional restrictions @@ -11007,7 +11151,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call) return 0; if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { - verbose(env, "kernel subsystem misconfigured verifier\n"); + verbose(env, "expected prog array map for tail call"); return -EINVAL; } @@ -11150,7 +11294,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) { return &env->insn_aux_data[env->insn_idx]; } @@ -11260,9 +11404,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { - verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", - func_id_name(func_id), func_id); - return -EINVAL; + verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id); + return -EFAULT; } memset(&meta, 0, sizeof(meta)); @@ -11270,8 +11413,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_func_proto(fn, func_id); if (err) { - verbose(env, "kernel subsystem misconfigured func %s#%d\n", - func_id_name(func_id), func_id); + verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; } @@ -11344,7 +11486,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn */ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { - verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); return -EFAULT; } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); @@ -11461,23 +11603,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.dynptr_id) { - verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + verifier_bug(env, "meta.dynptr_id already set"); return -EFAULT; } if (meta.ref_obj_id) { - verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + verifier_bug(env, "meta.ref_obj_id already set"); return -EFAULT; } id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } ref_obj_id = dynptr_ref_obj_id(env, reg); if (ref_obj_id < 0) { - verbose(env, "verifier internal 
error: failed to obtain dynptr ref_obj_id\n"); + verifier_bug(env, "failed to obtain dynptr ref_obj_id"); return ref_obj_id; } @@ -11562,9 +11704,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose(env, - "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + verifier_bug(env, "unexpected null map_ptr"); + return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && @@ -11654,10 +11795,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { - verbose(env, "verifier internal error:"); - verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", - func_id_name(func_id)); - return -EINVAL; + verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type", + func_id_name(func_id)); + return -EFAULT; } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; @@ -11682,8 +11822,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = ++env->id_gen; if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { - verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n", - func_id_name(func_id), func_id); + verifier_bug(env, "func %s#%d sets ref_obj_id more than once", + func_id_name(func_id), func_id); return -EFAULT; } @@ -12395,7 +12535,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) kfunc_class = IRQ_LOCK_KFUNC; } else { - verbose(env, "verifier internal error: unknown irq flags kfunc\n"); + verifier_bug(env, "unknown irq flags kfunc"); return -EFAULT; } @@ -12436,12 +12576,12 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state struct btf_record *rec = reg_btf_record(reg); if (!env->cur_state->active_locks) { - verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); + verifier_bug(env, "%s w/o active lock", __func__); return -EFAULT; } if (type_flag(reg->type) & NON_OWN_REF) { - verbose(env, "verifier internal error: NON_OWN_REF already set\n"); + verifier_bug(env, "NON_OWN_REF already set"); return -EFAULT; } @@ -12460,8 +12600,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o int i; if (!ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id is zero for " - "owning -> non-owning conversion\n"); + verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); return -EFAULT; } @@ -12481,7 +12620,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o return 0; } - verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); + verifier_bug(env, "ref state missing for ref_obj_id"); return -EFAULT; } @@ -12543,7 +12682,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->btf; break; default: - verbose(env, "verifier internal error: unknown reg type for lock check\n"); + verifier_bug(env, "unknown reg type for lock check"); return -EFAULT; } id = reg->id; @@ -12704,7 +12843,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, u32 head_off; if (meta->btf != btf_vmlinux) { - verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); + verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12735,7 +12874,7 @@ 
__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } if (*head_field) { - verbose(env, "verifier internal error: repeating %s arg\n", head_type_name); + verifier_bug(env, "repeating %s arg", head_type_name); return -EFAULT; } *head_field = field; @@ -12772,7 +12911,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, u32 node_off; if (meta->btf != btf_vmlinux) { - verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); + verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12900,7 +13039,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_prog(btf, &args[i])) { /* Used to reject repeated use of __prog. */ if (meta->arg_prog) { - verbose(env, "Only 1 prog->aux argument supported per-kfunc\n"); + verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; } meta->arg_prog = true; @@ -12916,7 +13055,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_constant(meta->btf, &args[i])) { if (meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(reg->var_off)) { @@ -12968,9 +13107,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); + verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", + regno, reg->ref_obj_id, + meta->ref_obj_id); return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; @@ -13050,7 +13189,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default: - WARN_ON_ONCE(1); + verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type); return -EFAULT; } @@ -13119,14 +13258,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { - verbose(env, "verifier internal error: no dynptr type for parent of clone\n"); + verifier_bug(env, "no dynptr type for parent of clone"); return -EFAULT; } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verbose(env, "verifier internal error: missing ref obj id for parent of clone\n"); + verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT; } } @@ -13139,7 +13278,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } meta->initialized_dynptr.id = id; @@ -13275,7 +13414,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { if (meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { @@ -13307,7 +13446,7 @@ 
static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ rec = reg_btf_record(reg); if (!rec) { - verbose(env, "verifier internal error: Couldn't find btf_record\n"); + verifier_bug(env, "Couldn't find btf_record"); return -EFAULT; } @@ -13541,16 +13680,24 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca regs[BPF_REG_0].btf_id = meta->ret_btf_id; } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); - if (!ret_t || !btf_type_is_struct(ret_t)) { + if (!ret_t) { + verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n", + meta->arg_constant.value); + return -EINVAL; + } else if (btf_type_is_struct(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->arg_constant.value; + } else if (btf_type_is_void(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + regs[BPF_REG_0].mem_size = 0; + } else { verbose(env, - "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); + "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n"); return -EINVAL; } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta->arg_constant.value; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); @@ -13558,7 +13705,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca mark_reg_known_zero(env, regs, BPF_REG_0); if (!meta->arg_constant.found) { - verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); + verifier_bug(env, "bpf_dynptr_slice(_rdwr) no constant size"); return -EFAULT; } @@ -13578,7 +13725,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } if (!meta->initialized_dynptr.id) { - verbose(env, "verifier internal error: no dynptr id\n"); + verifier_bug(env, "no dynptr id"); return -EFAULT; } regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; @@ -14018,7 +14165,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || + BPF_SRC(insn->code) == BPF_K || + cur_aux(env)->nospec; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -14218,10 +14367,9 @@ static int sanitize_err(struct bpf_verifier_env *env, case REASON_STACK: verbose(env, "R%d could not be pushed for speculative verification, %s\n", dst, err); - break; + return -ENOMEM; default: - verbose(env, "verifier internal error: unknown reason (%d)\n", - reason); + verifier_bug(env, "unknown reason (%d)", reason); break; } @@ -14288,7 +14436,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, } break; default: - break; + return -EOPNOTSUPP; } return 0; @@ -14315,7 +14463,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - int ret; + int ret, bounds_ret; dst_reg = ®s[dst]; @@ 
-14347,6 +14495,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to track offsets. + */ + if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED)) + return 0; + switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: case PTR_TO_MAP_VALUE: @@ -14515,11 +14670,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) return -EINVAL; reg_bounds_sync(dst_reg); - if (sanitize_check_bounds(env, insn, dst_reg) < 0) - return -EACCES; + bounds_ret = sanitize_check_bounds(env, insn, dst_reg); + if (bounds_ret == -EACCES) + return bounds_ret; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, &info, true); + if (verifier_bug_if(!can_skip_alu_sanitation(env, insn) + && !env->cur_state->speculative + && bounds_ret + && !ret, + env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) { + return -EFAULT; + } if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } @@ -14534,14 +14697,25 @@ static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, s32 *dst_smax = &dst_reg->s32_max_value; u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. + */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U32_MAX; } @@ -14554,14 +14728,25 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg, s64 *dst_smax = &dst_reg->smax_value; u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. 
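+ * For example, with dst in [U64_MAX, U64_MAX] and src in [2, 3],
+ * both sums wrap and the wrapped bounds [1, 2] are still correct.
+ * With dst in [U64_MAX - 1, U64_MAX] and src in [1, 2], only the
+ * upper bound wraps, so the result may be anywhere in [0, U64_MAX].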
+ */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U64_MAX; } @@ -14572,8 +14757,11 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; u32 umax_val = src_reg->u32_max_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { @@ -14581,14 +14769,18 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (dst_reg->u32_min_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->u32_min_value -= umax_val; - dst_reg->u32_max_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. + */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U32_MAX; } } @@ -14597,8 +14789,11 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { @@ -14606,14 +14801,18 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (dst_reg->umin_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value -= umax_val; - dst_reg->umax_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. 
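+ * For example, with dst in [1, 2] and src in [3, 4], every
+ * difference wraps and the wrapped bounds stay correct; with dst in
+ * [1, 5] and src in [2, 3], only the lower bound wraps, so nothing
+ * can be said about the result.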
+ */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U64_MAX; } } @@ -15075,6 +15274,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, switch (BPF_OP(insn->code)) { case BPF_ADD: case BPF_SUB: + case BPF_NEG: case BPF_AND: case BPF_XOR: case BPF_OR: @@ -15143,6 +15343,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; + case BPF_NEG: + env->fake_reg[0] = *dst_reg; + __mark_reg_known(dst_reg, 0); + scalar32_min_max_sub(dst_reg, &env->fake_reg[0]); + scalar_min_max_sub(dst_reg, &env->fake_reg[0]); + dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off); + break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); scalar32_min_max_mul(dst_reg, &src_reg); @@ -15282,12 +15489,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (WARN_ON_ONCE(ptr_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); - return -EINVAL; + return -EFAULT; } if (WARN_ON(!src_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: no src_reg\n"); - return -EINVAL; + return -EFAULT; } err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) @@ -15366,7 +15573,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + if (opcode == BPF_NEG) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + err = err ?: adjust_scalar_min_max_vals(env, insn, + ®s[insn->dst_reg], + regs[insn->dst_reg]); + } else { + err = check_reg_arg(env, insn->dst_reg, DST_OP); + } if (err) return err; @@ -16033,6 +16247,10 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state if (!is_reg_const(reg2, is_jmp32)) break; val = reg_const_value(reg2, is_jmp32); + /* Forget the ranges before narrowing tnums, to avoid invariant + * violations if we're on a dead branch. 
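+ * For example, if reg1 is already known to be 5 and this (dead)
+ * branch implies that bit 0 must be clear, narrowing var_off without
+ * first resetting the range would leave var_off and the [5, 5]
+ * bounds contradicting each other.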
+ */ + __mark_reg_unbounded(reg1); if (is_jmp32) { t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); @@ -16477,7 +16695,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = push_insn_history(env, this_branch, insn_flags, 0); + err = push_jmp_history(env, this_branch, insn_flags, 0); if (err) return err; } @@ -16535,7 +16753,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -16690,7 +16908,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->btf_id = aux->btf_var.btf_id; break; default: - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "pseudo btf id: unexpected dst reg type"); return -EFAULT; } return 0; @@ -16732,8 +16950,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; } else { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "unexpected src reg value for ldimm64"); + return -EFAULT; } return 0; @@ -16779,8 +16997,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } if (!env->ops->gen_ld_abs) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "gen_ld_abs is null"); + return -EFAULT; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || @@ -17190,7 +17408,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose(env, "insn state internal bug\n"); + verifier_bug(env, "insn state internal bug"); return -EFAULT; } return DONE_EXPLORING; @@ -17610,17 +17828,18 @@ static int check_cfg(struct bpf_verifier_env *env) int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_state) return -ENOMEM; - insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_postorder = env->cfg.insn_postorder = + kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_postorder) { kvfree(insn_state); kvfree(insn_stack); @@ -17650,7 +17869,7 @@ walk_cfg: break; default: if (ret > 0) { - verbose(env, "visit_insn internal bug\n"); + verifier_bug(env, "visit_insn internal bug"); ret = -EFAULT; } goto err_free; @@ -17658,7 +17877,7 @@ walk_cfg: } if (env->cfg.cur_stack < 0) { - verbose(env, "pop stack internal bug\n"); + verifier_bug(env, "pop stack internal bug"); ret = -EFAULT; goto err_free; } @@ -17754,7 +17973,7 @@ static int check_btf_func_early(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); min_size = min_t(u32, krec_size, urec_size); - krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); + krecord = 
kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!krecord) return -ENOMEM; @@ -17854,7 +18073,7 @@ static int check_btf_func(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); krecord = prog->aux->func_info; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!info_aux) return -ENOMEM; @@ -17940,7 +18159,7 @@ static int check_btf_line(struct bpf_verifier_env *env, * pass in a smaller bpf_line_info object. */ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), - GFP_KERNEL | __GFP_NOWARN); + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!linfo) return -ENOMEM; @@ -18263,10 +18482,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env, { int i; - if (st->frame[0]->regs[0].live & REG_LIVE_DONE) - /* all regs in this state in all frames were already marked */ - return; - for (i = 0; i <= st->curframe; i++) clean_func_state(env, st->frame[i]); } @@ -18274,7 +18489,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, /* the parentage chains form a tree. * the verifier states are added to state lists at given insn and * pushed into state stack for future exploration. - * when the verifier reaches bpf_exit insn some of the verifer states + * when the verifier reaches bpf_exit insn some of the verifier states * stored in the state lists have their final liveness state already, * but a lot of states will get revised from liveness point of view when * the verifier explores other branches. @@ -18306,7 +18521,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { - struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; struct list_head *pos, *head; @@ -18315,12 +18529,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) continue; - loop_entry = get_loop_entry(env, &sl->state); - if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) - continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) + /* all regs in this state in all frames were already marked */ + continue; + if (incomplete_read_marks(env, &sl->state)) + continue; clean_verifier_state(env, &sl->state); } } @@ -18767,9 +18983,7 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { - insn_idx = i == old->curframe - ? 
env->insn_idx - : old->frame[i + 1]->callsite; + insn_idx = frame_insn_idx(old, i); if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) @@ -18816,12 +19030,15 @@ static int propagate_liveness_reg(struct bpf_verifier_env *env, */ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent) + struct bpf_verifier_state *vparent, + bool *changed) { struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; int i, frame, err = 0; + bool tmp = false; + changed = changed ?: &tmp; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", vparent->curframe, vstate->curframe); @@ -18840,6 +19057,7 @@ static int propagate_liveness(struct bpf_verifier_env *env, &parent_reg[i]); if (err < 0) return err; + *changed |= err > 0; if (err == REG_LIVE_READ64) mark_insn_zext(env, &parent_reg[i]); } @@ -18851,6 +19069,7 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = &state->stack[i].spilled_ptr; err = propagate_liveness_reg(env, state_reg, parent_reg); + *changed |= err > 0; if (err < 0) return err; } @@ -18862,7 +19081,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, * propagate them into the current state */ static int propagate_precision(struct bpf_verifier_env *env, - const struct bpf_verifier_state *old) + const struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + bool *changed) { struct bpf_reg_state *state_reg; struct bpf_func_state *state; @@ -18910,13 +19131,53 @@ static int propagate_precision(struct bpf_verifier_env *env, verbose(env, "\n"); } - err = mark_chain_precision_batch(env); + err = __mark_chain_precision(env, cur, -1, changed); if (err < 0) return err; return 0; } +#define MAX_BACKEDGE_ITERS 64 + +/* Propagate read and precision marks from visit->backedges[*].state->equal_state + * to corresponding parent states of visit->backedges[*].state until fixed point is reached, + * then free visit->backedges. + * After execution of this function incomplete_read_marks() will return false + * for all states corresponding to @visit->callchain. + */ +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge; + struct bpf_verifier_state *st; + bool changed; + int i, err; + + i = 0; + do { + if (i++ > MAX_BACKEDGE_ITERS) { + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "%s: too many iterations\n", __func__); + for (backedge = visit->backedges; backedge; backedge = backedge->next) + mark_all_scalars_precise(env, &backedge->state); + break; + } + changed = false; + for (backedge = visit->backedges; backedge; backedge = backedge->next) { + st = &backedge->state; + err = propagate_liveness(env, st->equal_state, st, &changed); + if (err) + return err; + err = propagate_precision(env, st->equal_state, st, &changed); + if (err) + return err; + } + } while (changed); + + free_backedges(visit); + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -18944,7 +19205,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) * terminology) calls specially: as opposed to bounded BPF loops, it *expects* * states to match, which otherwise would look like an infinite loop. 
So while * iter_next() calls are taken care of, we still need to be careful and - * prevent erroneous and too eager declaration of "ininite loop", when + * prevent erroneous and too eager declaration of "infinite loop", when * iterators are involved. * * Here's a situation in pseudo-BPF assembly form: @@ -18986,7 +19247,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) * * This approach allows to keep infinite loop heuristic even in the face of * active iterator. E.g., C snippet below is and will be detected as - * inifintely looping: + * infinitely looping: * * struct bpf_iter_num it; * int *p, x; @@ -19026,14 +19287,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; + struct bpf_verifier_state *cur = env->cur_state, *new; + bool force_new_state, add_new_state, loop; int i, j, n, err, states_cnt = 0; - bool force_new_state, add_new_state, force_exact; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ - cur->insn_hist_end - cur->insn_hist_start > 40; + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -19050,6 +19311,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) clean_live_states(env, insn_idx, cur); + loop = false; head = explored_state(env, insn_idx); list_for_each_safe(pos, tmp, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); @@ -19129,7 +19391,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } @@ -19138,7 +19400,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } @@ -19180,38 +19442,9 @@ skip_inf_loop_check: add_new_state = false; goto miss; } - /* If sl->state is a part of a loop and this loop's entry is a part of - * current verification path then states have to be compared exactly. - * 'force_exact' is needed to catch the following case: - * - * initial Here state 'succ' was processed first, - * | it was eventually tracked to produce a - * V state identical to 'hdr'. - * .---------> hdr All branches from 'succ' had been explored - * | | and thus 'succ' has its .branches == 0. - * | V - * | .------... Suppose states 'cur' and 'succ' correspond - * | | | to the same instruction + callsites. - * | V V In such case it is necessary to check - * | ... ... if 'succ' and 'cur' are states_equal(). - * | | | If 'succ' and 'cur' are a part of the - * | V V same loop exact flag has to be set. - * | succ <- cur To check if that is the case, verify - * | | if loop entry of 'succ' is in current - * | V DFS path. - * | ... - * | | - * '----' - * - * Additional details are in the comment before get_loop_entry(). 
- */ - loop_entry = get_loop_entry(env, &sl->state); - if (IS_ERR(loop_entry)) - return PTR_ERR(loop_entry); - force_exact = loop_entry && loop_entry->branches > 0; - if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { - if (force_exact) - update_loop_entry(env, cur, loop_entry); + /* See comments for mark_all_regs_read_and_precise() */ + loop = incomplete_read_marks(env, &sl->state); + if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -19224,7 +19457,7 @@ hit: * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - err = propagate_liveness(env, &sl->state, cur); + err = propagate_liveness(env, &sl->state, cur, NULL); /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) @@ -19232,10 +19465,98 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_insn_history(env, cur, 0, 0); - err = err ? : propagate_precision(env, &sl->state); + err = err ? : push_jmp_history(env, cur, 0, 0); + err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; + /* When processing iterator based loops above propagate_liveness and + * propagate_precision calls are not sufficient to transfer all relevant + * read and precision marks. E.g. consider the following case: + * + * .-> A --. Assume the states are visited in the order A, B, C. + * | | | Assume that state B reaches a state equivalent to state A. + * | v v At this point, state C is not processed yet, so state A + * '-- B C has not received any read or precision marks from C. + * Thus, marks propagated from A to B are incomplete. + * + * The verifier mitigates this by performing the following steps: + * + * - Prior to the main verification pass, strongly connected components + * (SCCs) are computed over the program's control flow graph, + * intraprocedurally. + * + * - During the main verification pass, `maybe_enter_scc()` checks + * whether the current verifier state is entering an SCC. If so, an + * instance of a `bpf_scc_visit` object is created, and the state + * entering the SCC is recorded as the entry state. + * + * - This instance is associated not with the SCC itself, but with a + * `bpf_scc_callchain`: a tuple consisting of the call sites leading to + * the SCC and the SCC id. See `compute_scc_callchain()`. + * + * - When a verification path encounters a `states_equal(..., + * RANGE_WITHIN)` condition, there exists a call chain describing the + * current state and a corresponding `bpf_scc_visit` instance. A copy + * of the current state is created and added to + * `bpf_scc_visit->backedges`. + * + * - When a verification path terminates, `maybe_exit_scc()` is called + * from `update_branch_counts()`. For states with `branches == 0`, it + * checks whether the state is the entry state of any `bpf_scc_visit` + * instance. If it is, this indicates that all paths originating from + * this SCC visit have been explored. `propagate_backedges()` is then + * called, which propagates read and precision marks through the + * backedges until a fixed point is reached. + * (In the earlier example, this would propagate marks from A to B, + * from C to A, and then again from A to B.) + * + * A note on callchains + * -------------------- + * + * Consider the following example: + * + * void foo() { loop { ... SCC#1 ... } } + * void main() { + * A: foo(); + * B: ... 
+ * C: foo(); + * } + * + * Here, there are two distinct callchains leading to SCC#1: + * - (A, SCC#1) + * - (C, SCC#1) + * + * Each callchain identifies a separate `bpf_scc_visit` instance that + * accumulates backedge states. The `propagate_{liveness,precision}()` + * functions traverse the parent state of each backedge state, which + * means these parent states must remain valid (i.e., not freed) while + * the corresponding `bpf_scc_visit` instance exists. + * + * Associating `bpf_scc_visit` instances directly with SCCs instead of + * callchains would break this invariant: + * - States explored during `C: foo()` would contribute backedges to + * SCC#1, but SCC#1 would only be exited once the exploration of + * `A: foo()` completes. + * - By that time, the states explored between `A: foo()` and `C: foo()` + * (i.e., `B: ...`) may have already been freed, causing the parent + * links for states from `C: foo()` to become invalid. + */ + if (loop) { + struct bpf_scc_backedge *backedge; + + backedge = kzalloc(sizeof(*backedge), GFP_KERNEL_ACCOUNT); + if (!backedge) + return -ENOMEM; + err = copy_verifier_state(&backedge->state, cur); + backedge->state.equal_state = &sl->state; + backedge->state.insn_idx = insn_idx; + err = err ?: add_scc_backedge(env, &sl->state, backedge); + if (err) { + free_verifier_state(&backedge->state, false); + kvfree(backedge); + return err; + } + } return 1; } miss: @@ -19287,7 +19608,7 @@ miss: * When looping the sl->state.branches will be > 0 and this state * will not be considered for equivalence until branches == 0. */ - new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL_ACCOUNT); if (!new_sl) return -ENOMEM; env->total_states++; @@ -19309,13 +19630,20 @@ miss: return err; } new->insn_idx = insn_idx; - WARN_ONCE(new->branches != 1, - "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + verifier_bug_if(new->branches != 1, env, + "%s:branches_to_explore=%d insn %d", + __func__, new->branches, insn_idx); + err = maybe_enter_scc(env, new); + if (err) { + free_verifier_state(new, false); + kvfree(new_sl); + return err; + } cur->parent = new; cur->first_insn_idx = insn_idx; - cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; + clear_jmp_history(cur); list_add(&new_sl->node, head); /* connect new state to parentage chain. Current frame needs all @@ -19387,10 +19715,27 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) !reg_type_mismatch_ok(prev)); } +static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type) +{ + switch (base_type(type)) { + case PTR_TO_MEM: + case PTR_TO_BTF_ID: + return true; + default: + return false; + } +} + +static bool is_ptr_to_mem(enum bpf_reg_type type) +{ + return base_type(type) == PTR_TO_MEM; +} + static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + enum bpf_reg_type merged_type; if (*prev_type == NOT_INIT) { /* Saw a valid insn @@ -19407,15 +19752,24 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * Reject it. 
*/ if (allow_trust_mismatch && - base_type(type) == PTR_TO_BTF_ID && - base_type(*prev_type) == PTR_TO_BTF_ID) { + is_ptr_to_mem_or_btf_id(type) && + is_ptr_to_mem_or_btf_id(*prev_type)) { /* * Have to support a use case when one path through * the program yields TRUSTED pointer while another * is UNTRUSTED. Fallback to UNTRUSTED to generate * BPF_PROBE_MEM/BPF_PROBE_MEMSX. + * Same behavior of MEM_RDONLY flag. */ - *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type)) + merged_type = PTR_TO_MEM; + else + merged_type = PTR_TO_BTF_ID; + if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED)) + merged_type |= PTR_UNTRUSTED; + if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY)) + merged_type |= MEM_RDONLY; + *prev_type = merged_type; } else { verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; @@ -19425,20 +19779,224 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ return 0; } +enum { + PROCESS_BPF_EXIT = 1 +}; + +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, + bool exception_exit) +{ + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback function, + * for which reference_state must match caller reference + * state when it exits. + */ + int err = check_resource_leak(env, exception_exit, + !env->cur_state->curframe, + "BPF_EXIT instruction in main prog"); + if (err) + return err; + + /* The side effect of the prepare_func_exit which is + * being skipped is that it frees bpf_func_state. + * Typically, process_bpf_exit will only be hit with + * outermost exit. copy_verifier_state in pop_stack will + * handle freeing of any extra bpf_func_state left over + * from not processing all nested function exits. We + * also skip return code checks as they are not needed + * for exceptional exits. + */ + if (exception_exit) + return PROCESS_BPF_EXIT; + + if (env->cur_state->curframe) { + /* exit from nested function */ + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + *do_print_state = true; + return 0; + } + + err = check_return_code(env, BPF_REG_0, "R0"); + if (err) + return err; + return PROCESS_BPF_EXIT; +} + +static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) +{ + int err; + struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx]; + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(env, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; + + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). 
+ */ + err = check_load_mem(env, insn, false, is_ldsx, true, "ldx"); + if (err) + return err; + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, insn); + if (err) + return err; + env->insn_idx++; + return 0; + } + + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + + err = check_store_reg(env, insn, false); + if (err) + return err; + } else if (class == BPF_ST) { + enum bpf_reg_type dst_reg_type; + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose(env, "BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = cur_regs(env)[insn->dst_reg].type; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false, false); + if (err) + return err; + + err = save_aux_ptr_type(env, dst_reg_type, false); + if (err) + return err; + } else if (class == BPF_JMP || class == BPF_JMP32) { + u8 opcode = BPF_OP(insn->code); + + env->jmps_processed++; + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && + insn->off != 0) || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || + insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { + verbose(env, "BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + if (env->cur_state->active_locks) { + if ((insn->src_reg == BPF_REG_0 && + insn->imm != BPF_FUNC_spin_unlock) || + (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { + verbose(env, + "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } + } + if (insn->src_reg == BPF_PSEUDO_CALL) { + err = check_func_call(env, insn, &env->insn_idx); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + err = check_kfunc_call(env, insn, &env->insn_idx); + if (!err && is_bpf_throw_kfunc(insn)) + return process_bpf_exit_full(env, do_print_state, true); + } else { + err = check_helper_call(env, insn, &env->insn_idx); + } + if (err) + return err; + + mark_reg_scratched(env, BPF_REG_0); + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0 || + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { + verbose(env, "BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + if (class == BPF_JMP) + env->insn_idx += insn->off + 1; + else + env->insn_idx += insn->imm + 1; + return 0; + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { + verbose(env, "BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + return process_bpf_exit_full(env, do_print_state, false); + } else { + err = check_cond_jmp_op(env, insn, &env->insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + err = check_ld_abs(env, insn); + if (err) + return err; + + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + env->insn_idx++; + sanitize_mark_insn_seen(env); + } else { + verbose(env, "invalid BPF_LD mode\n"); + 
return -EINVAL; + } + } else { + verbose(env, "unknown insn class %d\n", class); + return -EINVAL; + } + + env->insn_idx++; + return 0; +} + static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; for (;;) { - bool exception_exit = false; struct bpf_insn *insn; - u8 class; + struct bpf_insn_aux_data *insn_aux; int err; /* reset current history entry on each new instruction */ @@ -19452,7 +20010,7 @@ static int do_check(struct bpf_verifier_env *env) } insn = &insns[env->insn_idx]; - class = BPF_CLASS(insn->code); + insn_aux = &env->insn_aux_data[env->insn_idx]; if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, @@ -19462,6 +20020,7 @@ static int do_check(struct bpf_verifier_env *env) } state->last_insn_idx = env->prev_insn_idx; + state->insn_idx = env->insn_idx; if (is_prune_point(env, env->insn_idx)) { err = is_state_visited(env, env->insn_idx); @@ -19483,7 +20042,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_insn_history(env, state, 0, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } @@ -19522,215 +20081,67 @@ static int do_check(struct bpf_verifier_env *env) return err; } - regs = cur_regs(env); sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; - if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(env, insn); - if (err) - return err; - - } else if (class == BPF_LDX) { - bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; + /* Reduce verification complexity by stopping speculative path + * verification when a nospec is encountered. + */ + if (state->speculative && insn_aux->nospec) + goto process_bpf_exit; - /* Check for reserved fields is already done in - * resolve_pseudo_ldimm64(). + err = do_check_insn(env, &do_print_state); + if (error_recoverable_with_nospec(err) && state->speculative) { + /* Prevent this speculative path from ever reaching the + * insn that would have been unsafe to execute. */ - err = check_load_mem(env, insn, false, is_ldsx, true, - "ldx"); - if (err) - return err; - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, insn); - if (err) - return err; - env->insn_idx++; - continue; - } - - if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - - err = check_store_reg(env, insn, false); - if (err) - return err; - } else if (class == BPF_ST) { - enum bpf_reg_type dst_reg_type; - - if (BPF_MODE(insn->code) != BPF_MEM || - insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_ST uses reserved fields\n"); - return -EINVAL; - } - /* check src operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, -1, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + insn_aux->nospec = true; + /* If it was an ADD/SUB insn, potentially remove any + * markings for alu sanitization. 
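The rewritten main loop leans on a small return-code convention: do_check_insn() returns a negative errno to abort verification, 0 to continue with the next instruction, or the positive sentinel PROCESS_BPF_EXIT to unwind the current path and pop the next saved state. A toy sketch of that driver pattern follows; step() and pop_saved_state() are hypothetical stand-ins, not verifier functions.

#include <stdio.h>

#define PROCESS_PATH_EXIT	1	/* positive sentinel, like PROCESS_BPF_EXIT */
#define TOY_EINVAL		22
#define TOY_ENOENT		2

/* Hypothetical stand-in: verify one step; <0 error, 0 continue, sentinel to pop. */
static int step(int *pc, int len)
{
	if (*pc < 0 || *pc >= len)
		return -TOY_EINVAL;		/* fell off the program */
	if (*pc == len - 1)
		return PROCESS_PATH_EXIT;	/* reached an exit instruction */
	(*pc)++;
	return 0;
}

/* Hypothetical stand-in for pop_stack(): this toy never branches, so nothing to pop. */
static int pop_saved_state(int *pc)
{
	(void)pc;
	return -TOY_ENOENT;
}

int main(void)
{
	int pc = 0, err;

	for (;;) {
		err = step(&pc, 8);
		if (err < 0)
			return 1;			/* hard verification error */
		if (err == PROCESS_PATH_EXIT) {
			err = pop_saved_state(&pc);
			if (err == -TOY_ENOENT)
				break;			/* all paths explored */
			if (err < 0)
				return 1;
			continue;
		}
		/* err == 0: fall through to the next instruction */
	}
	printf("all paths verified\n");
	return 0;
}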
+ */ + insn_aux->alu_state = 0; + goto process_bpf_exit; + } else if (err < 0) { + return err; + } else if (err == PROCESS_BPF_EXIT) { + goto process_bpf_exit; + } + WARN_ON_ONCE(err); + + if (state->speculative && insn_aux->nospec_result) { + /* If we are on a path that performed a jump-op, this + * may skip a nospec patched-in after the jump. This can + * currently never happen because nospec_result is only + * used for the write-ops + * `*(size*)(dst_reg+off)=src_reg|imm32` which must + * never skip the following insn. Still, add a warning + * to document this in case nospec_result is used + * elsewhere in the future. + * + * All non-branch instructions have a single + * fall-through edge. For these, nospec_result should + * already work. + */ + if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP || + BPF_CLASS(insn->code) == BPF_JMP32, env, + "speculation barrier after jump instruction may not have the desired effect")) + return -EFAULT; +process_bpf_exit: + mark_verifier_state_scratched(env); + err = update_branch_counts(env, env->cur_state); if (err) return err; - } else if (class == BPF_JMP || class == BPF_JMP32) { - u8 opcode = BPF_OP(insn->code); - - env->jmps_processed++; - if (opcode == BPF_CALL) { - if (BPF_SRC(insn->code) != BPF_K || - (insn->src_reg != BPF_PSEUDO_KFUNC_CALL - && insn->off != 0) || - (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL && - insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_CALL uses reserved fields\n"); - return -EINVAL; - } - - if (env->cur_state->active_locks) { - if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || - (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && - (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { - verbose(env, "function calls are not allowed while holding a lock\n"); - return -EINVAL; - } - } - if (insn->src_reg == BPF_PSEUDO_CALL) { - err = check_func_call(env, insn, &env->insn_idx); - } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - err = check_kfunc_call(env, insn, &env->insn_idx); - if (!err && is_bpf_throw_kfunc(insn)) { - exception_exit = true; - goto process_bpf_exit_full; - } - } else { - err = check_helper_call(env, insn, &env->insn_idx); - } - if (err) - return err; - - mark_reg_scratched(env, BPF_REG_0); - } else if (opcode == BPF_JA) { - if (BPF_SRC(insn->code) != BPF_K || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - (class == BPF_JMP && insn->imm != 0) || - (class == BPF_JMP32 && insn->off != 0)) { - verbose(env, "BPF_JA uses reserved fields\n"); - return -EINVAL; - } - - if (class == BPF_JMP) - env->insn_idx += insn->off + 1; - else - env->insn_idx += insn->imm + 1; - continue; - - } else if (opcode == BPF_EXIT) { - if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_EXIT uses reserved fields\n"); - return -EINVAL; - } -process_bpf_exit_full: - /* We must do check_reference_leak here before - * prepare_func_exit to handle the case when - * state->curframe > 0, it may be a callback - * function, for which reference_state must - * match caller reference state when it exits. - */ - err = check_resource_leak(env, exception_exit, !env->cur_state->curframe, - "BPF_EXIT instruction in main prog"); - if (err) - return err; - - /* The side effect of the prepare_func_exit - * which is being skipped is that it frees - * bpf_func_state. 
Typically, process_bpf_exit - * will only be hit with outermost exit. - * copy_verifier_state in pop_stack will handle - * freeing of any extra bpf_func_state left over - * from not processing all nested function - * exits. We also skip return code checks as - * they are not needed for exceptional exits. - */ - if (exception_exit) - goto process_bpf_exit; - - if (state->curframe) { - /* exit from nested function */ - err = prepare_func_exit(env, &env->insn_idx); - if (err) - return err; - do_print_state = true; - continue; - } - - err = check_return_code(env, BPF_REG_0, "R0"); - if (err) - return err; -process_bpf_exit: - mark_verifier_state_scratched(env); - update_branch_counts(env, env->cur_state); - err = pop_stack(env, &prev_insn_idx, - &env->insn_idx, pop_log); - if (err < 0) { - if (err != -ENOENT) - return err; - break; - } else { - if (verifier_bug_if(env->cur_state->loop_entry, env, - "broken loop detection")) - return -EFAULT; - do_print_state = true; - continue; - } - } else { - err = check_cond_jmp_op(env, insn, &env->insn_idx); - if (err) - return err; - } - } else if (class == BPF_LD) { - u8 mode = BPF_MODE(insn->code); - - if (mode == BPF_ABS || mode == BPF_IND) { - err = check_ld_abs(env, insn); - if (err) - return err; - - } else if (mode == BPF_IMM) { - err = check_ld_imm(env, insn); - if (err) + err = pop_stack(env, &prev_insn_idx, &env->insn_idx, + pop_log); + if (err < 0) { + if (err != -ENOENT) return err; - - env->insn_idx++; - sanitize_mark_insn_seen(env); + break; } else { - verbose(env, "invalid BPF_LD mode\n"); - return -EINVAL; + do_print_state = true; + continue; } - } else { - verbose(env, "unknown insn class %d\n", class); - return -EINVAL; } - - env->insn_idx++; } return 0; @@ -20682,7 +21093,10 @@ static int opt_remove_nops(struct bpf_verifier_env *env) static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, const union bpf_attr *attr) { - struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; + struct bpf_insn *patch; + /* use env->insn_buf as two independent buffers */ + struct bpf_insn *zext_patch = env->insn_buf; + struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2]; struct bpf_insn_aux_data *aux = env->insn_aux_data; int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; @@ -20801,8 +21215,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, -(subprogs[0].stack_depth + 8)); if (epilogue_cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "epilogue is too long"); + return -EFAULT; } else if (epilogue_cnt) { /* Save the ARG_PTR_TO_CTX for the epilogue to use */ cnt = 0; @@ -20824,14 +21238,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "gen_prologue is null"); + return -EFAULT; } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "prologue is too long"); + return -EFAULT; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) @@ -20858,6 +21272,28 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) bpf_convert_ctx_access_t convert_ctx_access; u8 mode; + if (env->insn_aux_data[i + delta].nospec) { + 
WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); + struct bpf_insn *patch = insn_buf; + + *patch++ = BPF_ST_NOSPEC(); + *patch++ = *insn; + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + /* This can not be easily merged with the + * nospec_result-case, because an insn may require a + * nospec before and after itself. Therefore also do not + * 'continue' here but potentially apply further + * patching to insn. *insn should equal patch[1] now. + */ + } + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -20907,14 +21343,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_spill) { - struct bpf_insn patch[] = { - *insn, - BPF_ST_NOSPEC(), - }; + env->insn_aux_data[i + delta].nospec_result) { + /* nospec_result is only used to mitigate Spectre v4 and + * to limit verification-time for Spectre v1. + */ + struct bpf_insn *patch = insn_buf; - cnt = ARRAY_SIZE(patch); - new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + *patch++ = *insn; + *patch++ = BPF_ST_NOSPEC(); + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -20949,6 +21387,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | @@ -20987,8 +21426,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose(env, "bpf verifier narrow ctx access misconfigured\n"); - return -EINVAL; + verifier_bug(env, "narrow ctx access misconfigured"); + return -EFAULT; } size_code = BPF_H; @@ -21006,16 +21445,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "error during ctx access conversion"); + return -EFAULT; } if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; if (shift && cnt + 1 >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier narrow ctx load misconfigured\n"); - return -EINVAL; + verifier_bug(env, "narrow ctx load misconfigured"); + return -EFAULT; } if (ctx_field_size <= 4) { if (shift) @@ -21445,8 +21884,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, */ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { - verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", - insn->imm); + verifier_bug(env, "kernel function descriptor not found for func_id %u", + insn->imm); return -EFAULT; } @@ -21461,8 +21900,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "NULL kptr_struct_meta expected 
at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21478,15 +21917,15 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && !kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21508,8 +21947,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (!kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21543,7 +21982,7 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat /* We only reserve one slot for hidden subprogs in subprog_info. */ if (env->hidden_subprog_cnt) { - verbose(env, "verifier internal error: only one hidden subprog supported\n"); + verifier_bug(env, "only one hidden subprog supported"); return -EFAULT; } /* We're not patching any existing instruction, just appending the new @@ -21583,13 +22022,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) u16 stack_depth_extra = 0; if (env->seen_exception && !env->exception_callback_subprog) { - struct bpf_insn patch[] = { - env->prog->insnsi[insn_cnt - 1], - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }; + struct bpf_insn *patch = insn_buf; - ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch)); + *patch++ = env->prog->insnsi[insn_cnt - 1]; + *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *patch++ = BPF_EXIT_INSN(); + ret = add_hidden_subprog(env, insn_buf, patch - insn_buf); if (ret < 0) return ret; prog = env->prog; @@ -21625,20 +22063,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->off == 1 && insn->imm == -1) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; bool isdiv = BPF_OP(insn->code) == BPF_DIV; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_sdiv[] = { - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - }; - struct bpf_insn chk_and_smod[] = { - BPF_MOV32_IMM(insn->dst_reg, 0), - }; + struct bpf_insn *patch = insn_buf; - patchlet = isdiv ? chk_and_sdiv : chk_and_smod; - cnt = isdiv ? ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod); + if (isdiv) + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + else + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -21657,83 +22093,79 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bool isdiv = BPF_OP(insn->code) == BPF_DIV; bool is_sdiv = isdiv && insn->off == 1; bool is_smod = !isdiv && insn->off == 1; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_div[] = { - /* [R,W]x div 0 -> 0 */ - BPF_RAW_INSN((is64 ? 
BPF_JMP : BPF_JMP32) | - BPF_JNE | BPF_K, insn->src_reg, - 0, 2, 0), - BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_mod[] = { - /* [R,W]x mod 0 -> [R,W]x */ - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, insn->src_reg, - 0, 1 + (is64 ? 0 : 1), 0), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - struct bpf_insn chk_and_sdiv[] = { + struct bpf_insn *patch = insn_buf; + + if (is_sdiv) { /* [R,W]x sdiv 0 -> 0 * LLONG_MIN sdiv -1 -> LLONG_MIN * INT_MIN sdiv -1 -> INT_MIN */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 4, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 1, 0), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_MOV | BPF_K, insn->dst_reg, - 0, 0, 0), + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 4, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 1, 0); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_MOV | BPF_K, insn->dst_reg, + 0, 0, 0); /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */ - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_smod[] = { + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; + } else if (is_smod) { /* [R,W]x mod 0 -> [R,W]x */ /* [R,W]x mod -1 -> 0 */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 3, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 3 + (is64 ? 0 : 1), 1), - BPF_MOV32_IMM(insn->dst_reg, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - - if (is_sdiv) { - patchlet = chk_and_sdiv; - cnt = ARRAY_SIZE(chk_and_sdiv); - } else if (is_smod) { - patchlet = chk_and_smod; - cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0); + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 3, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 3 + (is64 ? 0 : 1), 1); + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; + } else if (isdiv) { + /* [R,W]x div 0 -> 0 */ + *patch++ = BPF_RAW_INSN((is64 ? 
BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0); + *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; } else { - patchlet = isdiv ? chk_and_div : chk_and_mod; - cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); + /* [R,W]x mod 0 -> [R,W]x */ + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1 + (is64 ? 0 : 1), 0); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; } - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -21747,7 +22179,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { - struct bpf_insn *patch = &insn_buf[0]; + struct bpf_insn *patch = insn_buf; u64 uaddress_limit = bpf_arch_uaddress_limit(); if (!uaddress_limit) @@ -21779,8 +22211,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BPF_MODE(insn->code) == BPF_IND)) { cnt = env->ops->gen_ld_abs(insn, insn_buf); if (cnt == 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "%d insns generated for ld_abs", cnt); + return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); @@ -21798,7 +22230,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; - struct bpf_insn *patch = &insn_buf[0]; + struct bpf_insn *patch = insn_buf; bool issrc, isneg, isimm; u32 off_reg; @@ -22115,8 +22547,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (cnt == -EOPNOTSUPP) goto patch_map_ops_generic; if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "%d insns generated for map lookup", cnt); + return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, @@ -22403,9 +22835,9 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose(env, - "kernel subsystem misconfigured func %s#%d\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, + "not inlined functions %s#%d is missing func", + func_id_name(insn->imm), insn->imm); return -EFAULT; } insn->imm = fn->func - __bpf_call_base; @@ -22475,8 +22907,8 @@ next_insn: if (!map_ptr->ops->map_poke_track || !map_ptr->ops->map_poke_untrack || !map_ptr->ops->map_poke_run) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + verifier_bug(env, "poke tab is misconfigured"); + return -EFAULT; } ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); @@ -22666,7 +23098,12 @@ static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl; struct list_head *head, *pos, *tmp; - int i; + struct bpf_scc_info *info; + int i, j; + + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + while (!pop_stack(env, NULL, NULL, false)); list_for_each_safe(pos, tmp, &env->free_list) { sl = container_of(pos, struct bpf_verifier_state_list, node); @@ -22675,6 +23112,14 @@ static void free_states(struct bpf_verifier_env 
*env) } INIT_LIST_HEAD(&env->free_list); + for (i = 0; i < env->scc_cnt; ++i) { + info = env->scc_info[i]; + for (j = 0; j < info->num_visits; j++) + free_backedges(&info->visits[j]); + kvfree(info); + env->scc_info[i] = NULL; + } + if (!env->explored_states) return; @@ -22702,13 +23147,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) env->prev_linfo = NULL; env->pass_cnt++; - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL_ACCOUNT); if (!state) return -ENOMEM; state->curframe = 0; state->speculative = false; state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); return -ENOMEM; @@ -22759,11 +23204,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; - if (arg->arg_type & PTR_MAYBE_NULL) - reg->type |= PTR_MAYBE_NULL; + reg->type |= arg->arg_type & + (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY); mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; - reg->id = ++env->id_gen; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->id = ++env->id_gen; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { reg->type = PTR_TO_BTF_ID; if (arg->arg_type & PTR_MAYBE_NULL) @@ -22780,8 +23226,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* caller can pass either PTR_TO_ARENA or SCALAR */ mark_reg_unknown(env, regs, i); } else { - WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", - i - BPF_REG_1, arg->arg_type); + verifier_bug(env, "unhandled arg#%d type %d", + i - BPF_REG_1, arg->arg_type); ret = -EFAULT; goto out; } @@ -22811,14 +23257,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) ret = do_check(env); out: - /* check for NULL is necessary, since cur_state can be freed inside - * do_check() under memory pressure. - */ - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - while (!pop_stack(env, NULL, NULL, false)); if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); @@ -22934,7 +23372,7 @@ static void print_verification_stats(struct bpf_verifier_env *env) int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt) { - prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT); prog->aux->ctx_arg_info_size = cnt; return prog->aux->ctx_arg_info ? 
0 : -ENOMEM; @@ -23543,11 +23981,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { + verbose(env, "Attaching tracing programs to function '%s' is rejected.\n", + tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n"); + verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + tgt_info.tgt_name); return -EINVAL; } @@ -23676,6 +24117,7 @@ static bool can_jump(struct bpf_insn *insn) case BPF_JSLT: case BPF_JSLE: case BPF_JCOND: + case BPF_JSET: return true; } @@ -23878,7 +24320,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) * - repeat the computation while {in,out} fields changes for * any instruction. */ - state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL_ACCOUNT); if (!state) { err = -ENOMEM; goto out; @@ -23916,6 +24358,10 @@ static int compute_live_registers(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "Live regs before insn:\n"); for (i = 0; i < insn_cnt; ++i) { + if (env->insn_aux_data[i].scc) + verbose(env, "%3d ", env->insn_aux_data[i].scc); + else + verbose(env, " "); verbose(env, "%3d: ", i); for (j = BPF_REG_0; j < BPF_REG_10; ++j) if (insn_aux[i].live_regs_before & BIT(j)) @@ -23937,6 +24383,185 @@ out: return err; } +/* + * Compute strongly connected components (SCCs) on the CFG. + * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. + * If instruction is a sole member of its SCC and there are no self edges, + * assign it SCC number of zero. + * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation. + */ +static int compute_scc(struct bpf_verifier_env *env) +{ + const u32 NOT_ON_STACK = U32_MAX; + + struct bpf_insn_aux_data *aux = env->insn_aux_data; + const u32 insn_cnt = env->prog->len; + int stack_sz, dfs_sz, err = 0; + u32 *stack, *pre, *low, *dfs; + u32 succ_cnt, i, j, t, w; + u32 next_preorder_num; + u32 next_scc_id; + bool assign_scc; + u32 succ[2]; + + next_preorder_num = 1; + next_scc_id = 1; + /* + * - 'stack' accumulates vertices in DFS order, see invariant comment below; + * - 'pre[t] == p' => preorder number of vertex 't' is 'p'; + * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; + * - 'dfs' DFS traversal stack, used to emulate explicit recursion. + */ + stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT); + if (!stack || !pre || !low || !dfs) { + err = -ENOMEM; + goto exit; + } + /* + * References: + * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms" + * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components" + * + * The algorithm maintains the following invariant: + * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]'; + * - then, vertex 'u' remains on stack while vertex 'v' is on stack. 
+ * + * Consequently: + * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u', + * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack, + * and thus there is an SCC (loop) containing both 'u' and 'v'. + * - If 'low[v] == pre[v]', loops containing 'v' have been explored, + * and 'v' can be considered the root of some SCC. + * + * Here is a pseudo-code for an explicitly recursive version of the algorithm: + * + * NOT_ON_STACK = insn_cnt + 1 + * pre = [0] * insn_cnt + * low = [0] * insn_cnt + * scc = [0] * insn_cnt + * stack = [] + * + * next_preorder_num = 1 + * next_scc_id = 1 + * + * def recur(w): + * nonlocal next_preorder_num + * nonlocal next_scc_id + * + * pre[w] = next_preorder_num + * low[w] = next_preorder_num + * next_preorder_num += 1 + * stack.append(w) + * for s in successors(w): + * # Note: for classic algorithm the block below should look as: + * # + * # if pre[s] == 0: + * # recur(s) + * # low[w] = min(low[w], low[s]) + * # elif low[s] != NOT_ON_STACK: + * # low[w] = min(low[w], pre[s]) + * # + * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])' + * # does not break the invariant and makes itartive version of the algorithm + * # simpler. See 'Algorithm #3' from [2]. + * + * # 's' not yet visited + * if pre[s] == 0: + * recur(s) + * # if 's' is on stack, pick lowest reachable preorder number from it; + * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]', + * # so 'min' would be a noop. + * low[w] = min(low[w], low[s]) + * + * if low[w] == pre[w]: + * # 'w' is the root of an SCC, pop all vertices + * # below 'w' on stack and assign same SCC to them. + * while True: + * t = stack.pop() + * low[t] = NOT_ON_STACK + * scc[t] = next_scc_id + * if t == w: + * break + * next_scc_id += 1 + * + * for i in range(0, insn_cnt): + * if pre[i] == 0: + * recur(i) + * + * Below implementation replaces explicit recursion with array 'dfs'. + */ + for (i = 0; i < insn_cnt; i++) { + if (pre[i]) + continue; + stack_sz = 0; + dfs_sz = 1; + dfs[0] = i; +dfs_continue: + while (dfs_sz) { + w = dfs[dfs_sz - 1]; + if (pre[w] == 0) { + low[w] = next_preorder_num; + pre[w] = next_preorder_num; + next_preorder_num++; + stack[stack_sz++] = w; + } + /* Visit 'w' successors */ + succ_cnt = insn_successors(env->prog, w, succ); + for (j = 0; j < succ_cnt; ++j) { + if (pre[succ[j]]) { + low[w] = min(low[w], low[succ[j]]); + } else { + dfs[dfs_sz++] = succ[j]; + goto dfs_continue; + } + } + /* + * Preserve the invariant: if some vertex above in the stack + * is reachable from 'w', keep 'w' on the stack. + */ + if (low[w] < pre[w]) { + dfs_sz--; + goto dfs_continue; + } + /* + * Assign SCC number only if component has two or more elements, + * or if component has a self reference. 
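For reference, here is a stand-alone, classically recursive Tarjan SCC on a toy graph with at most two successors per node, mirroring the pre[]/low[]/stack roles used above. It is only an illustration: compute_scc() is iterative (explicit dfs[] stack), uses the low[] propagation from Pearce's variant, and leaves the SCC id at zero for singleton components without a self edge, none of which this sketch does.

#include <stdio.h>

#define N 6
/* Toy CFG: 0->1, 1->2, 2->0 (SCC {0,1,2}), 3->4, 4->3 (SCC {3,4}), 5->5 (self loop). */
static const int edges[N][2] = {
	{1, -1}, {2, -1}, {0, -1}, {4, -1}, {3, -1}, {5, -1}
};

static int pre[N], low[N], on_stack[N], scc[N];
static int stack[N], sp, next_pre = 1, next_scc = 1;

static void tarjan(int v)
{
	pre[v] = low[v] = next_pre++;
	stack[sp++] = v;
	on_stack[v] = 1;

	for (int i = 0; i < 2 && edges[v][i] >= 0; i++) {
		int s = edges[v][i];

		if (!pre[s]) {
			tarjan(s);
			if (low[s] < low[v])
				low[v] = low[s];
		} else if (on_stack[s] && pre[s] < low[v]) {
			low[v] = pre[s];
		}
	}

	if (low[v] == pre[v]) {		/* v is the root of an SCC */
		int t;

		do {
			t = stack[--sp];
			on_stack[t] = 0;
			scc[t] = next_scc;
		} while (t != v);
		next_scc++;
	}
}

int main(void)
{
	for (int i = 0; i < N; i++)
		if (!pre[i])
			tarjan(i);
	for (int i = 0; i < N; i++)
		printf("insn %d -> scc %d\n", i, scc[i]);
	return 0;
}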
+ */ + assign_scc = stack[stack_sz - 1] != w; + for (j = 0; j < succ_cnt; ++j) { + if (succ[j] == w) { + assign_scc = true; + break; + } + } + /* Pop component elements from stack */ + do { + t = stack[--stack_sz]; + low[t] = NOT_ON_STACK; + if (assign_scc) + aux[t].scc = next_scc_id; + } while (t != w); + if (assign_scc) + next_scc_id++; + dfs_sz--; + } + } + env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL_ACCOUNT); + if (!env->scc_info) { + err = -ENOMEM; + goto exit; + } +exit: + kvfree(stack); + kvfree(pre); + kvfree(low); + kvfree(dfs); + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23945,6 +24570,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 u32 log_true_size; bool is_priv; + BTF_TYPE_EMIT(enum bpf_features); + /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) return -EINVAL; @@ -23952,7 +24579,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL_ACCOUNT); if (!env) return -ENOMEM; @@ -24015,7 +24642,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct list_head), - GFP_USER); + GFP_KERNEL_ACCOUNT); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; @@ -24058,6 +24685,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_scc(env); + if (ret < 0) + goto skip_full_check; + ret = compute_live_registers(env); if (ret < 0) goto skip_full_check; @@ -24142,7 +24773,7 @@ skip_full_check: /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_maps) { ret = -ENOMEM; @@ -24157,7 +24788,7 @@ skip_full_check: /* if program passed verifier, update used_btfs in bpf_prog_aux */ env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, sizeof(env->used_btfs[0]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_btfs) { ret = -ENOMEM; goto err_release_maps; @@ -24198,9 +24829,9 @@ err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); - kvfree(env->insn_hist); err_free_env: kvfree(env->cfg.insn_postorder); + kvfree(env->scc_info); kvfree(env); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a723b7dc6e4e..312c6a8b55bb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2074,6 +2074,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); +#ifdef CONFIG_CGROUP_BPF + for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++) + cgrp->bpf.revisions[i] = 1; +#endif + init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } diff --git a/kernel/cpu.c b/kernel/cpu.c index a59e009e0be4..faf0f23fc5d8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -37,6 +37,7 @@ #include <linux/cpuset.h> #include <linux/random.h> #include <linux/cc_platform.h> +#include <linux/parser.h> #include 
<trace/events/power.h> #define CREATE_TRACE_POINTS @@ -3174,8 +3175,38 @@ void __init boot_cpu_hotplug_init(void) #ifdef CONFIG_CPU_MITIGATIONS /* - * These are used for a global "mitigations=" cmdline option for toggling - * optional CPU mitigations. + * All except the cross-thread attack vector are mitigated by default. + * Cross-thread mitigation often requires disabling SMT which is expensive + * so cross-thread mitigations are only partially enabled by default. + * + * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is + * present. + */ +static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = { + [CPU_MITIGATE_USER_KERNEL] = true, + [CPU_MITIGATE_USER_USER] = true, + [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM), + [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM), +}; + +bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v) +{ + if (v < NR_CPU_ATTACK_VECTORS) + return attack_vectors[v]; + + WARN_ONCE(1, "Invalid attack vector %d\n", v); + return false; +} + +/* + * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally + * be combined with attack-vector disables which follow them. + * + * Examples: + * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread + * mitigations=auto,nosmt,no_guest_host,no_guest_guest + * + * mitigations=off is equivalent to disabling all attack vectors. */ enum cpu_mitigations { CPU_MITIGATIONS_OFF, @@ -3183,19 +3214,96 @@ enum cpu_mitigations { CPU_MITIGATIONS_AUTO_NOSMT, }; +enum { + NO_USER_KERNEL, + NO_USER_USER, + NO_GUEST_HOST, + NO_GUEST_GUEST, + NO_CROSS_THREAD, + NR_VECTOR_PARAMS, +}; + +enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO; static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +static const match_table_t global_mitigations = { + { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"}, + { CPU_MITIGATIONS_AUTO, "auto"}, + { CPU_MITIGATIONS_OFF, "off"}, +}; + +static const match_table_t vector_mitigations = { + { NO_USER_KERNEL, "no_user_kernel"}, + { NO_USER_USER, "no_user_user"}, + { NO_GUEST_HOST, "no_guest_host"}, + { NO_GUEST_GUEST, "no_guest_guest"}, + { NO_CROSS_THREAD, "no_cross_thread"}, + { NR_VECTOR_PARAMS, NULL}, +}; + +static int __init mitigations_parse_global_opt(char *arg) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) { + const char *pattern = global_mitigations[i].pattern; + + if (!strncmp(arg, pattern, strlen(pattern))) { + cpu_mitigations = global_mitigations[i].token; + return strlen(pattern); + } + } + + return 0; +} + static int __init mitigations_parse_cmdline(char *arg) { - if (!strcmp(arg, "off")) - cpu_mitigations = CPU_MITIGATIONS_OFF; - else if (!strcmp(arg, "auto")) - cpu_mitigations = CPU_MITIGATIONS_AUTO; - else if (!strcmp(arg, "auto,nosmt")) - cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; - else - pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", - arg); + char *s, *p; + int len; + + len = mitigations_parse_global_opt(arg); + + if (cpu_mitigations_off()) { + memset(attack_vectors, 0, sizeof(attack_vectors)); + smt_mitigations = SMT_MITIGATIONS_OFF; + } else if (cpu_mitigations_auto_nosmt()) { + smt_mitigations = SMT_MITIGATIONS_ON; + } + + p = arg + len; + + if (!*p) + return 0; + + /* Attack vector controls may come after the ',' */ + if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) { + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg); + return 0; + } + + while ((s = strsep(&p, ",")) != NULL) 
{ + switch (match_token(s, vector_mitigations, NULL)) { + case NO_USER_KERNEL: + attack_vectors[CPU_MITIGATE_USER_KERNEL] = false; + break; + case NO_USER_USER: + attack_vectors[CPU_MITIGATE_USER_USER] = false; + break; + case NO_GUEST_HOST: + attack_vectors[CPU_MITIGATE_GUEST_HOST] = false; + break; + case NO_GUEST_GUEST: + attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false; + break; + case NO_CROSS_THREAD: + smt_mitigations = SMT_MITIGATIONS_OFF; + break; + default: + pr_crit("Unsupported mitigations options %s\n", s); + return 0; + } + } return 0; } diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index d4b8bd0af79b..77fcd83dd663 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -12,5 +12,6 @@ ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o +obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o +obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c index a8dd1f27417c..408d28b5179d 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -1,84 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/context_tracking.h> -#include <linux/entry-common.h> +#include <linux/irq-entry-common.h> #include <linux/resume_user_mode.h> #include <linux/highmem.h> #include <linux/jump_label.h> #include <linux/kmsan.h> #include <linux/livepatch.h> -#include <linux/audit.h> #include <linux/tick.h> -#include "common.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} - -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) -{ - long ret = 0; - - /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. */ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { - trace_sys_enter(regs, syscall); - /* - * Probes or BPF hooks in the tracepoint may have changed the - * system call number as well. - */ - syscall = syscall_get_nr(current, regs); - } - - syscall_enter_audit(regs, syscall); - - return ret ? 
: syscall; -} - -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } @@ -133,46 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). - */ -static inline bool report_single_step(unsigned long work) -{ - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); -} - noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); @@ -220,7 +109,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * TINY_RCU does not support EQS, so let the compiler eliminate * this part when enabled. */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c new file mode 100644 index 000000000000..66e6ba7fa80c --- /dev/null +++ b/kernel/entry/syscall-common.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/audit.h> +#include <linux/entry-common.h> +#include "common.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long work) +{ + long ret = 0; + + /* + * Handle Syscall User Dispatch. This must comes first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + + /* Handle ptrace */ + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { + ret = ptrace_report_syscall_entry(regs); + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. 
*/ + if (work & SYSCALL_WORK_SECCOMP) { + ret = __secure_computing(); + if (ret == -1L) + return ret; + } + + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { + trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } + + syscall_enter_audit(regs, syscall); + + return ret ? : syscall; +} + +noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + instrumentation_begin(); + local_irq_enable(); + instrumentation_end(); +} + +/* + * If SYSCALL_EMU is set, then the only reason to report is when + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * instruction has been already reported in syscall_enter_from_user_mode(). + */ +static inline bool report_single_step(unsigned long work) +{ + if (work & SYSCALL_WORK_SYSCALL_EMU) + return false; + + return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; +} + +void syscall_exit_work(struct pt_regs *regs, unsigned long work) +{ + bool step; + + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + + audit_syscall_exit(regs); + + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(work); + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) + ptrace_report_syscall_exit(regs, step); +} diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 5340c5aa89e7..a9055eccb27e 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon if (offset || len || selector) return -EINVAL; break; - case PR_SYS_DISPATCH_ON: + case PR_SYS_DISPATCH_EXCLUSIVE_ON: /* * Validate the direct dispatcher region just for basic * sanity against overflow and a 0-sized dispatcher @@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon */ if (offset && offset + len <= offset) return -EINVAL; - + break; + case PR_SYS_DISPATCH_INCLUSIVE_ON: + if (len == 0 || offset + len <= offset) + return -EINVAL; /* - * access_ok() will clear memory tags for tagged addresses - * if current has memory tagging enabled. - - * To enable a tracer to set a tracees selector the - * selector address must be untagged for access_ok(), - * otherwise an untagged tracer will always fail to set a - * tagged tracees selector. + * Invert the range, the check in syscall_user_dispatch() + * supports wrap-around. */ - if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) - return -EFAULT; - + offset = offset + len; + len = -len; break; default: return -EINVAL; } + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. 
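The exclusive dispatch mode that survives this rework can be exercised from user space with a short program. The sketch below is illustrative and assumes a kernel with syscall user dispatch enabled; it uses the long-standing PR_SYS_DISPATCH_ON spelling (the mode the change above now handles as PR_SYS_DISPATCH_EXCLUSIVE_ON), passes an empty always-allowed region (offset = len = 0) so the selector alone decides, and does not exercise the new inclusive mode.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_* */

#ifndef SYSCALL_DISPATCH_FILTER_ALLOW
# define SYSCALL_DISPATCH_FILTER_ALLOW	0
# define SYSCALL_DISPATCH_FILTER_BLOCK	1
#endif

static volatile char selector = SYSCALL_DISPATCH_FILTER_ALLOW;
static volatile int trapped_nr = -1;

static void sigsys_handler(int sig, siginfo_t *info, void *ctx)
{
	/* Re-allow syscalls with a plain store before doing anything else. */
	selector = SYSCALL_DISPATCH_FILTER_ALLOW;
	trapped_nr = info->si_syscall;
	(void)sig;
	(void)ctx;
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigsys_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSYS, &sa, NULL);

	/* No always-allowed code range; the selector decides for every syscall. */
	if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
		  0, 0, &selector)) {
		perror("prctl");
		return 1;
	}

	selector = SYSCALL_DISPATCH_FILTER_BLOCK;	/* intercept from here on */
	syscall(SYS_getppid);				/* trapped, delivers SIGSYS */

	printf("intercepted syscall %d (SYS_getppid = %d)\n",
	       trapped_nr, (int)SYS_getppid);
	return 0;
}

The handler clears the selector before making any call of its own, because the dispatch filter still applies while the SIGSYS handler runs.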
+ * + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. + */ + if (mode != PR_SYS_DISPATCH_OFF && selector && + !access_ok(untagged_addr(selector), sizeof(*selector))) + return -EFAULT; + task->syscall_dispatch.selector = selector; task->syscall_dispatch.offset = offset; task->syscall_dispatch.len = len; task->syscall_dispatch.on_dispatch = false; - if (mode == PR_SYS_DISPATCH_ON) + if (mode != PR_SYS_DISPATCH_OFF) set_task_syscall_work(task, SYSCALL_USER_DISPATCH); else clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); diff --git a/kernel/fork.c b/kernel/fork.c index 115e2b5f628f..52901fe4a3c2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); - futex_mm_init(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif @@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->def_flags = 0; } + if (futex_mm_init(mm)) + goto fail_mm_init; + if (mm_alloc_pgd(mm)) goto fail_nopgd; @@ -1090,6 +1092,8 @@ fail_nocontext: fail_noid: mm_free_pgd(mm); fail_nopgd: + futex_hash_free(mm); +fail_mm_init: free_mm(mm); return NULL; } @@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, @@ -1886,10 +1890,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { - int i; - - for (i = 0; i < RV_PER_TASK_MONITORS; i++) - p->rv[i].da_mon.monitoring = false; + memset(&p->rv, 0, sizeof(p->rv)); } #else #define rv_task_fork(p) do {} while (0) @@ -2123,9 +2124,8 @@ __latent_entropy struct task_struct *copy_process( lockdep_init_task(p); #endif -#ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ -#endif + #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -3216,7 +3216,7 @@ int unshare_files(void) return 0; } -int sysctl_max_threads(const struct ctl_table *table, int write, +static int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -3238,3 +3238,21 @@ int sysctl_max_threads(const struct ctl_table *table, int write, return 0; } + +static const struct ctl_table fork_sysctl_table[] = { + { + .procname = "threads-max", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_max_threads, + }, +}; + +static int __init init_fork_sysctl(void) +{ + register_sysctl_init("kernel", fork_sysctl_table); + return 0; +} + +subsys_initcall(init_fork_sysctl); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 90d53fb0ee9e..d9bb5567af0c 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -42,7 +42,6 @@ #include <linux/fault-inject.h> #include <linux/slab.h> #include <linux/prctl.h> -#include <linux/rcuref.h> #include <linux/mempolicy.h> #include <linux/mmap_lock.h> @@ -65,12 +64,11 @@ static struct { #define futex_queues (__futex_data.queues) struct futex_private_hash { - rcuref_t users; + int state; unsigned int 
hash_mask; struct rcu_head rcu; void *mm; bool custom; - bool immutable; struct futex_hash_bucket queues[]; }; @@ -129,6 +127,12 @@ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph); #ifdef CONFIG_FUTEX_PRIVATE_HASH +static bool futex_ref_get(struct futex_private_hash *fph); +static bool futex_ref_put(struct futex_private_hash *fph); +static bool futex_ref_is_dead(struct futex_private_hash *fph); + +enum { FR_PERCPU = 0, FR_ATOMIC }; + static inline bool futex_key_is_private(union futex_key *key) { /* @@ -138,19 +142,14 @@ static inline bool futex_key_is_private(union futex_key *key) return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); } -bool futex_private_hash_get(struct futex_private_hash *fph) +static bool futex_private_hash_get(struct futex_private_hash *fph) { - if (fph->immutable) - return true; - return rcuref_get(&fph->users); + return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { - /* Ignore return value, last put is verified via rcuref_is_dead() */ - if (fph->immutable) - return; - if (rcuref_put(&fph->users)) + if (futex_ref_put(fph)) wake_up_var(fph->mm); } @@ -243,14 +242,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm, fph = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); if (fph) { - if (!rcuref_is_dead(&fph->users)) { + if (!futex_ref_is_dead(fph)) { mm->futex_phash_new = new; return false; } futex_rehash_private(fph, new); } - rcu_assign_pointer(mm->futex_phash, new); + new->state = FR_PERCPU; + scoped_guard(rcu) { + mm->futex_batches = get_state_synchronize_rcu(); + rcu_assign_pointer(mm->futex_phash, new); + } kvfree_rcu(fph, rcu); return true; } @@ -289,9 +292,7 @@ again: if (!fph) return NULL; - if (fph->immutable) - return fph; - if (rcuref_get(&fph->users)) + if (futex_private_hash_get(fph)) return fph; } futex_pivot_hash(mm); @@ -1524,19 +1525,221 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, } #define FH_CUSTOM 0x01 -#define FH_IMMUTABLE 0x02 #ifdef CONFIG_FUTEX_PRIVATE_HASH + +/* + * futex-ref + * + * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that + * code because it just doesn't fit right. + * + * Dual counter, per-cpu / atomic approach like percpu-refcount, except it + * re-initializes the state automatically, such that the fph swizzle is also a + * transition back to per-cpu. + */ + +static void futex_ref_rcu(struct rcu_head *head); + +static void __futex_ref_atomic_begin(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * The counter we're about to switch to must have fully switched; + * otherwise it would be impossible for it to have reported success + * from futex_ref_is_dead(). + */ + WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); + + /* + * Set the atomic to the bias value such that futex_ref_{get,put}() + * will never observe 0. Will be fixed up in __futex_ref_atomic_end() + * when folding in the percpu count. + */ + atomic_long_set(&mm->futex_atomic, LONG_MAX); + smp_store_release(&fph->state, FR_ATOMIC); + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static void __futex_ref_atomic_end(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + unsigned int count = 0; + long ret; + int cpu; + + /* + * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC + * and per this RCU callback, everybody must now observe this state and + * use the atomic variable. 
+ */ + WARN_ON_ONCE(fph->state != FR_ATOMIC); + + /* + * Therefore the per-cpu counter is now stable, sum and reset. + */ + for_each_possible_cpu(cpu) { + unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); + count += *ptr; + *ptr = 0; + } + + /* + * Re-init for the next cycle. + */ + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + + /* + * Add actual count, subtract bias and initial refcount. + * + * The moment this atomic operation happens, futex_ref_is_dead() can + * become true. + */ + ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); + if (!ret) + wake_up_var(mm); + + WARN_ON_ONCE(ret < 0); + mmput_async(mm); +} + +static void futex_ref_rcu(struct rcu_head *head) +{ + struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); + struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); + + if (fph->state == FR_PERCPU) { + /* + * Per this extra grace-period, everybody must now observe + * fph as the current fph and no previously observed fph's + * are in-flight. + * + * Notably, nobody will now rely on the atomic + * futex_ref_is_dead() state anymore so we can begin the + * migration of the per-cpu counter into the atomic. + */ + __futex_ref_atomic_begin(fph); + return; + } + + __futex_ref_atomic_end(fph); +} + +/* + * Drop the initial refcount and transition to atomics. + */ +static void futex_ref_drop(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * Can only transition the current fph; + */ + WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); + /* + * We enqueue at least one RCU callback. Ensure mm stays if the task + * exits before the transition is completed. + */ + mmget(mm); + + /* + * In order to avoid the following scenario: + * + * futex_hash() __futex_pivot_hash() + * guard(rcu); guard(mm->futex_hash_lock); + * fph = mm->futex_phash; + * rcu_assign_pointer(&mm->futex_phash, new); + * futex_hash_allocate() + * futex_ref_drop() + * fph->state = FR_ATOMIC; + * atomic_set(, BIAS); + * + * futex_private_hash_get(fph); // OOPS + * + * Where an old fph (which is FR_ATOMIC) and should fail on + * inc_not_zero, will succeed because a new transition is started and + * the atomic is bias'ed away from 0. + * + * There must be at least one full grace-period between publishing a + * new fph and trying to replace it. + */ + if (poll_state_synchronize_rcu(mm->futex_batches)) { + /* + * There was a grace-period, we can begin now. 
+ */ + __futex_ref_atomic_begin(fph); + return; + } + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static bool futex_ref_get(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) { + this_cpu_inc(*mm->futex_ref); + return true; + } + + return atomic_long_inc_not_zero(&mm->futex_atomic); +} + +static bool futex_ref_put(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) { + this_cpu_dec(*mm->futex_ref); + return false; + } + + return atomic_long_dec_and_test(&mm->futex_atomic); +} + +static bool futex_ref_is_dead(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) + return false; + + return atomic_long_read(&mm->futex_atomic) == 0; +} + +int futex_mm_init(struct mm_struct *mm) +{ + mutex_init(&mm->futex_hash_lock); + RCU_INIT_POINTER(mm->futex_phash, NULL); + mm->futex_phash_new = NULL; + /* futex-ref */ + atomic_long_set(&mm->futex_atomic, 0); + mm->futex_batches = get_state_synchronize_rcu(); + mm->futex_ref = alloc_percpu(unsigned int); + if (!mm->futex_ref) + return -ENOMEM; + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + return 0; +} + void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; + free_percpu(mm->futex_ref); kvfree(mm->futex_phash_new); fph = rcu_dereference_raw(mm->futex_phash); - if (fph) { - WARN_ON_ONCE(rcuref_read(&fph->users) > 1); + if (fph) kvfree(fph); - } } static bool futex_pivot_pending(struct mm_struct *mm) @@ -1549,7 +1752,7 @@ static bool futex_pivot_pending(struct mm_struct *mm) return true; fph = rcu_dereference(mm->futex_phash); - return rcuref_is_dead(&fph->users); + return futex_ref_is_dead(fph); } static bool futex_hash_less(struct futex_private_hash *a, @@ -1591,21 +1794,20 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) */ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); - if (fph && (!fph->hash_mask || fph->immutable)) { + if (fph && !fph->hash_mask) { if (custom) return -EBUSY; return 0; } } - fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + fph = kvzalloc(struct_size(fph, queues, hash_slots), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) return -ENOMEM; - rcuref_init(&fph->users, 1); fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; - fph->immutable = !!(flags & FH_IMMUTABLE); fph->mm = mm; for (i = 0; i < hash_slots; i++) @@ -1629,7 +1831,7 @@ again: mm->futex_phash_new = NULL; if (fph) { - if (cur && (!cur->hash_mask || cur->immutable)) { + if (cur && !cur->hash_mask) { /* * If two threads simultaneously request the global * hash then the first one performs the switch, @@ -1645,7 +1847,7 @@ again: * allocated a replacement hash, drop the initial * reference on the existing hash. 
*/ - futex_private_hash_put(cur); + futex_ref_drop(cur); } if (new) { @@ -1722,19 +1924,6 @@ static int futex_hash_get_slots(void) return 0; } -static int futex_hash_get_immutable(void) -{ - struct futex_private_hash *fph; - - guard(rcu)(); - fph = rcu_dereference(current->mm->futex_phash); - if (fph && fph->immutable) - return 1; - if (fph && !fph->hash_mask) - return 1; - return 0; -} - #else static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) @@ -1747,10 +1936,6 @@ static int futex_hash_get_slots(void) return 0; } -static int futex_hash_get_immutable(void) -{ - return 0; -} #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -1760,10 +1945,8 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) switch (arg2) { case PR_FUTEX_HASH_SET_SLOTS: - if (arg4 & ~FH_FLAG_IMMUTABLE) + if (arg4) return -EINVAL; - if (arg4 & FH_FLAG_IMMUTABLE) - flags |= FH_IMMUTABLE; ret = futex_hash_allocate(arg3, flags); break; @@ -1771,10 +1954,6 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) ret = futex_hash_get_slots(); break; - case PR_FUTEX_HASH_GET_IMMUTABLE: - ret = futex_hash_get_immutable(); - break; - default: ret = -EINVAL; break; diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index fcd1617212ee..c74eac572acd 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -228,14 +228,12 @@ extern void futex_hash_get(struct futex_hash_bucket *hb); extern void futex_hash_put(struct futex_hash_bucket *hb); extern struct futex_private_hash *futex_private_hash(void); -extern bool futex_private_hash_get(struct futex_private_hash *fph); extern void futex_private_hash_put(struct futex_private_hash *fph); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static inline void futex_hash_get(struct futex_hash_bucket *hb) { } static inline void futex_hash_put(struct futex_hash_bucket *hb) { } static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } -static inline bool futex_private_hash_get(void) { return false; } static inline void futex_private_hash_put(struct futex_private_hash *fph) { } #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3f02a0e45254..1da5e9d9da71 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -144,6 +144,17 @@ config GENERIC_IRQ_DEBUGFS config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD bool +config IRQ_KUNIT_TEST + bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS + depends on KUNIT=y + default KUNIT_ALL_TESTS + imply SMP + help + This option enables KUnit tests for the IRQ subsystem API. These are + only for development and testing, not for regular kernel use cases. + + If unsure, say N. 
+ endmenu config GENERIC_IRQ_MULTI_HANDLER diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index c0f44c06d69d..6ab3a4055667 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o +obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2b274007e8ba..624106e886ad 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -457,22 +457,33 @@ void unmask_threaded_irq(struct irq_desc *desc) unmask_irq(desc); } -static bool irq_check_poll(struct irq_desc *desc) -{ - if (!(desc->istate & IRQS_POLL_INPROGRESS)) - return false; - return irq_wait_for_poll(desc); +/* Busy wait until INPROGRESS is cleared */ +static bool irq_wait_on_inprogress(struct irq_desc *desc) +{ + if (IS_ENABLED(CONFIG_SMP)) { + do { + raw_spin_unlock(&desc->lock); + while (irqd_irq_inprogress(&desc->irq_data)) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (irqd_irq_inprogress(&desc->irq_data)); + + /* Might have been disabled in the meantime */ + return !irqd_irq_disabled(&desc->irq_data) && desc->action; + } + return false; } static bool irq_can_handle_pm(struct irq_desc *desc) { - unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; + struct irq_data *irqd = &desc->irq_data; + const struct cpumask *aff; /* * If the interrupt is not in progress and is not an armed * wakeup interrupt, proceed. */ - if (!irqd_has_set(&desc->irq_data, mask)) + if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED)) return true; /* @@ -480,13 +491,54 @@ static bool irq_can_handle_pm(struct irq_desc *desc) * and suspended, disable it and notify the pm core about the * event. */ - if (irq_pm_check_wakeup(desc)) + if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) { + irq_pm_handle_wakeup(desc); + return false; + } + + /* Check whether the interrupt is polled on another CPU */ + if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) { + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + return irq_wait_on_inprogress(desc); + } + + /* The below works only for single target interrupts */ + if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) || + !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq) return false; /* - * Handle a potential concurrent poll on a different core. + * If the interrupt affinity was moved to this CPU and the + * interrupt is currently handled on the previous target CPU, then + * busy wait for INPROGRESS to be cleared. Otherwise for edge type + * interrupts the handler might get stuck on the previous target: + * + * CPU 0 CPU 1 (new target) + * handle_edge_irq() + * repeat: + * handle_event() handle_edge_irq() + * if (INPROGRESS) { + * set(PENDING); + * mask(); + * return; + * } + * if (PENDING) { + * clear(PENDING); + * unmask(); + * goto repeat; + * } + * + * This happens when the device raises interrupts with a high rate + * and always before handle_event() completes and the CPU0 handler + * can clear INPROGRESS. This has been observed in virtual machines. 
*/ - return irq_check_poll(desc); + aff = irq_data_get_effective_affinity_mask(irqd); + if (cpumask_first(aff) != smp_processor_id()) + return false; + return irq_wait_on_inprogress(desc); } static inline bool irq_can_handle_actions(struct irq_desc *desc) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index aebfe225c9a6..0164ca48da59 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -20,6 +20,7 @@ #define istate core_internal_state__do_not_mess_with_it extern bool noirqdebug; +extern int irq_poll_cpu; extern struct irqaction chained_action; @@ -112,7 +113,6 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); int check_irq_resend(struct irq_desc *desc, bool inject); void clear_irq_resend(struct irq_desc *desc); void irq_resend_init(struct irq_desc *desc); -bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); void wake_threads_waitq(struct irq_desc *desc); @@ -277,11 +277,11 @@ static inline bool irq_is_nmi(struct irq_desc *desc) } #ifdef CONFIG_PM_SLEEP -bool irq_pm_check_wakeup(struct irq_desc *desc); +void irq_pm_handle_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); #else -static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } +static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { } static inline void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c new file mode 100644 index 000000000000..5161b56a12f9 --- /dev/null +++ b/kernel/irq/irq_test.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: LGPL-2.1+ + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/irqdomain.h> +#include <linux/nodemask.h> +#include <kunit/test.h> + +#include "internals.h" + +static irqreturn_t noop_handler(int irq, void *data) +{ + return IRQ_HANDLED; +} + +static void noop(struct irq_data *data) { } +static unsigned int noop_ret(struct irq_data *data) { return 0; } + +static int noop_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + irq_data_update_effective_affinity(data, dest); + + return 0; +} + +static struct irq_chip fake_irq_chip = { + .name = "fake", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + .irq_set_affinity = noop_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static void irq_disable_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_free_disabled_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + KUNIT_ASSERT_GE(test, virq, 0); + + 
irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + free_irq(virq, NULL); + KUNIT_EXPECT_GE(test, desc->depth, 1); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_shutdown_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + .mask = CPU_MASK_ALL, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for managed shutdown"); + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + irq_shutdown_and_deactivate(desc); + + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + + KUNIT_EXPECT_EQ(test, irq_activate(desc), 0); +#ifdef CONFIG_SMP + irq_startup_managed(desc); +#endif + + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_cpuhotplug_test(struct kunit *test) +{ + struct irq_desc *desc; + struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for CPU hotplug"); + if (!get_cpu_device(1)) + kunit_skip(test, "requires more than 1 CPU for CPU hotplug"); + if (!cpu_is_hotpluggable(1)) + kunit_skip(test, "CPU 1 must be hotpluggable"); + + cpumask_copy(&affinity.mask, cpumask_of(1)); + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + KUNIT_EXPECT_EQ(test, remove_cpu(1), 0); + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + KUNIT_EXPECT_GE(test, desc->depth, 1); + KUNIT_EXPECT_EQ(test, add_cpu(1), 0); + + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + 
enable_irq(virq); + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static struct kunit_case irq_test_cases[] = { + KUNIT_CASE(irq_disable_depth_test), + KUNIT_CASE(irq_free_disabled_test), + KUNIT_CASE(irq_shutdown_depth_test), + KUNIT_CASE(irq_cpuhotplug_test), + {} +}; + +static struct kunit_suite irq_test_suite = { + .name = "irq_test_cases", + .test_cases = irq_test_cases, +}; + +kunit_test_suite(irq_test_suite); +MODULE_DESCRIPTION("IRQ unit test suite"); +MODULE_LICENSE("GPL"); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c8b6de09047b..dc473faadcc8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -317,6 +317,7 @@ static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info domain->flags |= info->domain_flags; domain->exit = info->exit; + domain->dev = info->dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (info->parent) { @@ -1561,6 +1562,7 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, } irq_domain_free_irqs_common(domain, virq, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top); static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9febe797a5f6..9b09ad3f9914 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -889,6 +889,7 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode, if (domain) { irq_domain_update_bus_token(domain, info->bus_token); + domain->dev = info->dev; if (info->flags & MSI_FLAG_PARENT_PM_DEV) domain->pm_dev = parent->pm_dev; } @@ -1051,6 +1052,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, bundle->info.data = domain_data; bundle->info.chip_data = chip_data; bundle->info.alloc_data = &bundle->alloc_info; + bundle->info.dev = dev; pops = parent->msi_parent_ops; snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s", @@ -1089,7 +1091,6 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (!domain) return false; - domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) { diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 445912d51033..f7394729cedc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -13,17 +13,13 @@ #include "internals.h" -bool irq_pm_check_wakeup(struct irq_desc *desc) +void irq_pm_handle_wakeup(struct irq_desc *desc) { - if (irqd_is_wakeup_armed(&desc->irq_data)) { - irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); - desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; - desc->depth++; - irq_disable(desc); - pm_system_irq_wakeup(irq_desc_get_irq(desc)); - return true; - } - return false; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; + desc->depth++; + irq_disable(desc); + pm_system_irq_wakeup(irq_desc_get_irq(desc)); } /* diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 8f26982e7300..73280ccb74b0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -19,45 +19,10 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(struct timer_list *unused); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); -static int irq_poll_cpu; +int irq_poll_cpu; static atomic_t irq_poll_active; /* - * We wait here for a poller to finish. 
- * - * If the poll runs on this CPU, then we yell loudly and return - * false. That will leave the interrupt line disabled in the worst - * case, but it should never happen. - * - * We wait until the poller is done and then recheck disabled and - * action (about to be disabled). Only if it's still active, we return - * true and let the handler run. - */ -bool irq_wait_for_poll(struct irq_desc *desc) -{ - lockdep_assert_held(&desc->lock); - - if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), - "irq poll in progress on cpu %d for irq %d\n", - smp_processor_id(), desc->irq_data.irq)) - return false; - -#ifdef CONFIG_SMP - do { - raw_spin_unlock(&desc->lock); - while (irqd_irq_inprogress(&desc->irq_data)) - cpu_relax(); - raw_spin_lock(&desc->lock); - } while (irqd_irq_inprogress(&desc->irq_data)); - /* Might have been disabled in meantime */ - return !irqd_irq_disabled(&desc->irq_data) && desc->action; -#else - return false; -#endif -} - - -/* * Recovery handler for misrouted interrupts. */ static bool try_one_irq(struct irq_desc *desc, bool force) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 4198f30aac3c..1e7635864124 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -829,8 +829,7 @@ static struct bpf_iter_reg ksym_iter_reg_info = { .seq_info = &ksym_iter_seq_info, }; -BTF_ID_LIST(btf_ksym_iter_id) -BTF_ID(struct, kallsym_iter) +BTF_ID_LIST_SINGLE(btf_ksym_iter_id, struct, kallsym_iter) static int __init bpf_ksym_iter_register(void) { diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index c2871180edcc..49ab81faaed9 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -533,7 +533,7 @@ static void test_barrier_nothreads(struct kunit *test) struct kcsan_scoped_access *reorder_access = NULL; #endif arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; - atomic_t dummy; + atomic_t dummy = ATOMIC_INIT(0); KCSAN_TEST_REQUIRES(test, reorder_access != NULL); KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP)); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3a9a9f240dbc..351cd7d76dfa 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1080,7 +1080,7 @@ int kernel_kexec(void) console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) - goto Resume_console; + goto Resume_devices; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows @@ -1135,8 +1135,6 @@ int kernel_kexec(void) dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); - Resume_console: - pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); Restore_console: diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ffe0c3d52306..ab8f9fc1f0d1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -135,8 +135,12 @@ struct kprobe_insn_cache kprobe_insn_slots = { static int collect_garbage_slots(struct kprobe_insn_cache *c); /** - * __get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. + * __get_insn_slot - Find a slot on an executable page for an instruction. + * @c: Pointer to kprobe instruction cache + * + * Description: Locates available slot on existing executable pages, + * allocates an executable page if there's no room on existing ones. + * Return: Pointer to instruction slot on success, NULL on failure. 
*/ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index dd2bbf73718b..2d4c5bab5af8 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -297,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) dst->nr += src->nr; } -struct lock_class_stats lock_stats(struct lock_class *class) +void lock_stats(struct lock_class *class, struct lock_class_stats *stats) { - struct lock_class_stats stats; int cpu, i; - memset(&stats, 0, sizeof(struct lock_class_stats)); + memset(stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++) + stats->contention_point[i] += pcs->contention_point[i]; - for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) - stats.contending_point[i] += pcs->contending_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++) + stats->contending_point[i] += pcs->contending_point[i]; - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); + lock_time_add(&pcs->read_waittime, &stats->read_waittime); + lock_time_add(&pcs->write_waittime, &stats->write_waittime); - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + lock_time_add(&pcs->read_holdtime, &stats->read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats->write_holdtime); - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; + for (i = 0; i < ARRAY_SIZE(stats->bounces); i++) + stats->bounces[i] += pcs->bounces[i]; } - - return stats; } void clear_lock_stats(struct lock_class *class) @@ -6619,8 +6616,16 @@ void lockdep_unregister_key(struct lock_class_key *key) if (need_callback) call_rcu(&delayed_free.rcu_head, free_zapped_rcu); - /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ - synchronize_rcu(); + /* + * Wait until is_dynamic_key() has finished accessing k->hash_entry. + * + * Some operations like __qdisc_destroy() will call this in a debug + * kernel, and the network traffic is disabled while waiting, hence + * the delay of the wait matters in debugging cases. Currently use a + * synchronize_rcu_expedited() to speed up the wait at the cost of + * system IPIs. TODO: Replace RCU with hazptr for this. 
+ */ + synchronize_rcu_expedited(); } EXPORT_SYMBOL_GPL(lockdep_unregister_key); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 82156caf77d1..0e5e6ffe91a3 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -47,29 +47,31 @@ enum { __LOCKF(USED_READ) }; +enum { #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | -static const unsigned long LOCKF_ENABLED_IRQ = + LOCKF_ENABLED_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | -static const unsigned long LOCKF_USED_IN_IRQ = + LOCKF_USED_IN_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | -static const unsigned long LOCKF_ENABLED_IRQ_READ = + LOCKF_ENABLED_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | -static const unsigned long LOCKF_USED_IN_IRQ_READ = + LOCKF_USED_IN_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE +}; #define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) #define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b52c07c4707c..1916db9aa46b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -657,7 +657,7 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!test_bit(idx, lock_classes_in_use)) continue; iter->class = class; - iter->stats = lock_stats(class); + lock_stats(class, &iter->stats); iter++; } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 6e6f6071cfa2..949103fd8e9b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, { lockdep_assert_held(&lock->wait_lock); - /* Mark the current thread as blocked on the lock: */ - task->blocked_on = waiter; + /* Current thread can't be already blocked (since it's executing!) 
*/ + DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { + struct mutex *blocked_on = __get_task_blocked_on(task); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); - task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a39ecccbd106..de7d6702cd96 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -191,9 +191,7 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { -#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); -#endif debug_mutex_add_waiter(lock, waiter, current); list_add_tail(&waiter->list, list); @@ -209,9 +207,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_remove_waiter(lock, waiter, current); -#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER hung_task_clear_blocker(); -#endif } /* @@ -644,6 +640,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + __set_task_blocked_on(current, lock); set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { @@ -680,6 +677,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas first = __mutex_waiter_is_first(lock, &waiter); + /* + * As we likely have been woken up by task + * that has cleared our blocked_on state, re-set + * it to the lock we are trying to acquire. + */ + set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -691,8 +694,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (first) { trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + /* + * mutex_optimistic_spin() can call schedule(), so + * clear blocked on so we don't become unselectable + * to run. 
+ */ + clear_task_blocked_on(current, lock); if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) break; + set_task_blocked_on(current, lock); trace_contention_begin(lock, LCB_F_MUTEX); } @@ -700,6 +710,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); if (ww_ctx) { @@ -729,9 +740,11 @@ skip_wait: return 0; err: + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + WARN_ON(__get_task_blocked_on(current)); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -942,6 +955,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); + __clear_task_blocked_on(next, lock); wake_q_add(&wake_q, next); } diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index cbff35b9b7ae..2e8080a9bee3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -6,7 +6,7 @@ * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ - +#ifndef CONFIG_PREEMPT_RT /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: @@ -70,3 +70,4 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_unlock(lock) do { } while (0) # define debug_mutex_init(lock, name, key) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 2d933528a0fa..bafd5af98eae 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -13,6 +13,24 @@ */ int max_lock_depth = 1024; +static const struct ctl_table rtmutex_sysctl_table[] = { + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_rtmutex_sysctl(void) +{ + register_sysctl_init("kernel", rtmutex_sysctl_table); + return 0; +} + +subsys_initcall(init_rtmutex_sysctl); + /* * Debug aware fast / slowpath lock,trylock,unlock * diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2ddb827e3bea..8572dba95af4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -727,8 +727,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) return ret; } -#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) - static inline enum owner_state rwsem_owner_state(struct task_struct *owner, unsigned long flags) { @@ -835,7 +833,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) enum owner_state owner_state; owner_state = rwsem_spin_on_owner(sem); - if (!(owner_state & OWNER_SPINNABLE)) + if (owner_state == OWNER_NONSPINNABLE) break; /* diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 37f025a096c9..086fd5487ca7 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -284,6 +284,12 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); #endif + /* + * When waking up the task to die, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. 
+ */ + __clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -331,9 +337,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ - if (owner != current) + if (owner != current) { + /* + * When waking up the task to wound, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + */ + __clear_task_blocked_on(owner, lock); wake_q_add(wake_q, owner); - + } return true; } diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 8d74b0a21c82..51ddd8866ef3 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -58,6 +58,9 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const u32 __start___kcrctab[]; extern const u32 __start___kcrctab_gpl[]; +#define KMOD_PATH_LEN 256 +extern char modprobe_path[]; + struct load_info { const char *name; /* pointer to module in temporary copy, freed at end of load_module() */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 43df45c39f59..81f9df8859dc 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -126,9 +126,37 @@ static void mod_update_bounds(struct module *mod) } /* Block module loading/unloading? */ -int modules_disabled; +static int modules_disabled; core_param(nomodule, modules_disabled, bint, 0); +static const struct ctl_table module_sysctl_table[] = { + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init init_module_sysctl(void) +{ + register_sysctl_init("kernel", module_sysctl_table); + return 0; +} + +subsys_initcall(init_module_sysctl); + /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); @@ -2645,7 +2673,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->trace_bprintk_fmt_start), &mod->num_trace_bprintk_fmt); #endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD +#ifdef CONFIG_DYNAMIC_FTRACE /* sechdrs[0].sh_size is always zero */ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, sizeof(*mod->ftrace_callsites), diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index b401ff4b02d2..c7622ff5226a 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -56,9 +56,9 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) { const struct bin_attribute *const *bin_attr; - for (bin_attr = sect_attrs->grp.bin_attrs_new; *bin_attr; bin_attr++) + for (bin_attr = sect_attrs->grp.bin_attrs; *bin_attr; bin_attr++) kfree((*bin_attr)->attr.name); - kfree(sect_attrs->grp.bin_attrs_new); + kfree(sect_attrs->grp.bin_attrs); kfree(sect_attrs); } @@ -86,7 +86,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info) /* Setup section attributes. 
*/ sect_attrs->grp.name = "sections"; - sect_attrs->grp.bin_attrs_new = gattr; + sect_attrs->grp.bin_attrs = gattr; sattr = §_attrs->attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { @@ -101,7 +101,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info) ret = -ENOMEM; goto out; } - sattr->read_new = module_sect_read; + sattr->read = module_sect_read; sattr->private = (void *)sec->sh_addr; sattr->size = MODULE_SECT_READ_SIZE; sattr->attr.mode = 0400; @@ -144,7 +144,7 @@ struct module_notes_attrs { static void free_notes_attrs(struct module_notes_attrs *notes_attrs) { - kfree(notes_attrs->grp.bin_attrs_new); + kfree(notes_attrs->grp.bin_attrs); kfree(notes_attrs); } @@ -178,7 +178,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) } notes_attrs->grp.name = "notes"; - notes_attrs->grp.bin_attrs_new = gattr; + notes_attrs->grp.bin_attrs = gattr; nattr = ¬es_attrs->attrs[0]; for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { @@ -190,7 +190,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) nattr->attr.mode = 0444; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *)info->sechdrs[i].sh_addr; - nattr->read_new = sysfs_bin_attr_simple_read; + nattr->read = sysfs_bin_attr_simple_read; *(gattr++) = nattr++; } ++loaded; diff --git a/kernel/panic.c b/kernel/panic.c index b0b9a8bf4560..43817111c979 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -84,6 +84,50 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); #ifdef CONFIG_SYSCTL + +/* + * Taint values can only be increased + * This means we can safely use a temporary. + */ +static int proc_taint(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + + /* + * Poor man's atomic or. 
Not worth adding a primitive + * to everyone's atomic.h for this + */ + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) + add_taint(i, LOCKDEP_STILL_OK); + } + + return err; +} + static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { @@ -97,6 +141,12 @@ static const struct ctl_table kern_panic_table[] = { }, #endif { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, + { .procname = "panic", .data = &panic_timeout, .maxlen = sizeof(int), @@ -133,6 +183,16 @@ static const struct ctl_table kern_panic_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, +#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ + defined(CONFIG_DEBUG_STACKOVERFLOW) + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif }; static __init int kernel_panic_sysctls_init(void) @@ -307,15 +367,15 @@ static void panic_other_cpus_shutdown(bool crash_kexec) } /** - * panic - halt the system + * vpanic - halt the system * @fmt: The text string to print + * @args: Arguments for the format string * * Display a message, then perform cleanups. This function never returns. */ -void panic(const char *fmt, ...) +void vpanic(const char *fmt, va_list args) { static char buf[1024]; - va_list args; long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; @@ -366,9 +426,7 @@ void panic(const char *fmt, ...) console_verbose(); bust_spinlocks(1); - va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; @@ -505,7 +563,17 @@ void panic(const char *fmt, ...) mdelay(PANIC_TIMER_STEP); } } +EXPORT_SYMBOL(vpanic); +/* Identical to vpanic(), except it takes variadic arguments instead of va_list */ +void panic(const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + vpanic(fmt, args); + va_end(args); +} EXPORT_SYMBOL(panic); #define TAINT_FLAG(taint, _c_true, _c_false, _module) \ diff --git a/kernel/pid.c b/kernel/pid.c index 07db7d8d066c..c45a28c16cd2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -713,6 +713,29 @@ static struct ctl_table_root pid_table_root = { .set_ownership = pid_table_root_set_ownership, }; +static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp_pid; + int r; + struct ctl_table tmp_table = *table; + + tmp_pid = pid_vnr(cad_pid); + tmp_table.data = &tmp_pid; + + r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp_pid); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + static const struct ctl_table pid_table[] = { { .procname = "pid_max", @@ -723,6 +746,14 @@ static const struct ctl_table pid_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif }; #endif diff --git a/kernel/power/console.c b/kernel/power/console.c index fcdf0e14a47d..19c48aa5355d 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -16,6 +16,7 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static bool vt_switch_done; static DEFINE_MUTEX(vt_switch_mutex); @@ -136,17 +137,21 @@ void pm_prepare_console(void) if (orig_fgconsole < 0) return; + vt_switch_done = true; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return; } void pm_restore_console(void) { - if (!pm_vt_switch()) + if (!pm_vt_switch() && !vt_switch_done) return; if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); } + + vt_switch_done = false; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 9216e3b91d3b..1f1f30cca573 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -381,6 +381,23 @@ static int create_image(int platform_mode) return error; } +static void shrink_shmem_memory(void) +{ + struct sysinfo info; + unsigned long nr_shmem_pages, nr_freed_pages; + + si_meminfo(&info); + nr_shmem_pages = info.sharedram; /* current page count used for shmem */ + /* + * The intent is to reclaim all shmem pages. Though shrink_all_memory() can + * only reclaim about half of them, it's enough for creating the hibernation + * image. + */ + nr_freed_pages = shrink_all_memory(nr_shmem_pages); + pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n", + nr_shmem_pages, nr_freed_pages); +} + /** * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. @@ -422,6 +439,15 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } + /* + * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem + * pages will use lots of system memory, causing hibernation image creation + * to fail due to insufficient free memory. + * This call is to force flush the shmem pages to swap disk and reclaim + * the system memory so that image creation can succeed. 
+ */ + shrink_shmem_memory(); + console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); diff --git a/kernel/power/main.c b/kernel/power/main.c index 3d484630505a..3cf2d7e72567 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,6 +8,7 @@ #include <linux/acpi.h> #include <linux/export.h> +#include <linux/init.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> @@ -112,6 +113,14 @@ int pm_notifier_call_chain(unsigned long val) /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; +static int __init pm_async_setup(char *str) +{ + if (!strcmp(str, "off")) + pm_async_enabled = 0; + return 1; +} +__setup("pm_async=", pm_async_setup); + static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2af36cfe35cd..501df0676a61 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1536,7 +1536,7 @@ static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); copy_pfn = memory_bm_next_pfn(copy_bm); - for(;;) { + for (;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; @@ -2161,13 +2161,13 @@ static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) + if (strcmp(info->uts.sysname, init_utsname()->sysname)) return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) + if (strcmp(info->uts.release, init_utsname()->release)) return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) + if (strcmp(info->uts.version, init_utsname()->version)) return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) + if (strcmp(info->uts.machine, init_utsname()->machine)) return "machine"; return NULL; } @@ -2361,7 +2361,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, struct memory_bitmap *zero_bm) { unsigned long decoded_pfn; - bool zero; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 48a24e7b309d..bbed41ad29cf 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -72,7 +72,6 @@ int vprintk_store(int facility, int level, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); -__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 70ec0f21abc3..7a893d51d02b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -55,22 +55,24 @@ MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); -/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) -#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x02 /* ... 
disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ -#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND \ +// Bits for ->extendables field, extendables param, and related definitions. +#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh. +#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts. +#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption. +#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh(). +#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched(). +#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader. +#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader. +#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer. + // Note: Manual start, automatic end. +#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above. +#define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ - RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) + RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN. #define RCUTORTURE_RDR_ALLBITS \ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2) @@ -110,6 +112,7 @@ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); @@ -156,6 +159,7 @@ static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct *updown_task; static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; @@ -378,6 +382,8 @@ struct rcu_torture_ops { void (*readunlock)(int idx); int (*readlock_held)(void); // lockdep. int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. 
+ int (*down_read)(void); + void (*up_read)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); @@ -427,6 +433,7 @@ struct rcu_torture_ops { int no_pi_lock; int debug_objects; int start_poll_irqsoff; + int have_up_down; const char *name; }; @@ -464,7 +471,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); - if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + if ((preempt_count() & HARDIRQ_MASK) || softirq_count()) longdelay_ms = 5; /* Avoid triggering BH limits. */ mdelay(longdelay_ms); rtrsp->rt_delay_ms = longdelay_ms; @@ -711,11 +718,6 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } - if (reader_flavor & SRCU_READ_FLAVOR_LITE) { - idx = srcu_read_lock_lite(srcu_ctlp); - WARN_ON_ONCE(idx & ~0x1); - ret += idx << 2; - } if (reader_flavor & SRCU_READ_FLAVOR_FAST) { scp = srcu_read_lock_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); @@ -749,8 +751,6 @@ static void srcu_torture_read_unlock(int idx) WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); if (reader_flavor & SRCU_READ_FLAVOR_FAST) srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); - if (reader_flavor & SRCU_READ_FLAVOR_LITE) - srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -762,6 +762,50 @@ static int torture_srcu_read_lock_held(void) return srcu_read_lock_held(srcu_ctlp); } +static bool srcu_torture_have_up_down(void) +{ + int rf = reader_flavor; + + if (!rf) + rf = SRCU_READ_FLAVOR_NORMAL; + return !!(cur_ops->have_up_down & rf); +} + +static int srcu_torture_down_read(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + WARN_ON_ONCE(reader_flavor & (reader_flavor - 1)); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_down_read(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + return idx; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_down_read_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + return idx << 3; + } + WARN_ON_ONCE(1); + return 0; +} + +static void srcu_torture_up_read(int idx) +{ + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || + !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_up_read(srcu_ctlp, idx & 0x1); + else + WARN_ON_ONCE(1); +} + static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(srcu_ctlp); @@ -819,6 +863,8 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, @@ -839,6 +885,8 @@ static struct rcu_torture_ops srcu_ops = { .irq_capable = 
1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, .name = "srcu" }; @@ -864,6 +912,8 @@ static struct rcu_torture_ops srcud_ops = { .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, @@ -883,6 +933,8 @@ static struct rcu_torture_ops srcud_ops = { .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, .name = "srcud" }; @@ -910,7 +962,8 @@ static struct rcu_torture_ops busted_srcud_ops = { /* * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. + * This implementation does not work well with CPU hotplug nor + * with rcutorture's shuffling. */ static void synchronize_rcu_trivial(void) @@ -923,6 +976,16 @@ static void synchronize_rcu_trivial(void) } } +static void rcu_sync_torture_init_trivial(void) +{ + rcu_sync_torture_init(); + // if (onoff_interval || shuffle_interval) { + if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) { + onoff_interval = 0; + shuffle_interval = 0; + } +} + static int rcu_torture_read_lock_trivial(void) { preempt_disable(); @@ -936,7 +999,7 @@ static void rcu_torture_read_unlock_trivial(int idx) static struct rcu_torture_ops trivial_ops = { .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, + .init = rcu_sync_torture_init_trivial, .readlock = rcu_torture_read_lock_trivial, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, @@ -1722,6 +1785,7 @@ rcu_torture_writer(void *arg) cur_ops->gp_kthread_dbg(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); + break; } if (stutter_waited) sched_set_normal(current, oldnice); @@ -1915,14 +1979,14 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, // Verify the specified RCUTORTURE_RDR* state. #define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() -static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) { int mask; - if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi()) return; - WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS); + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS); WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. 
@@ -1930,21 +1994,21 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, return; WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + !softirq_count(), ROEC_ARGS); WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); WARN_ONCE(cur_ops->readlock_nesting && (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && cur_ops->readlock_nesting() == 0, ROEC_ARGS); - // Timer handlers have all sorts of stuff disabled, so ignore + // Interrupt handlers have all sorts of stuff disabled, so ignore // unintended disabling. - if (insoftirq) + if (in_serving_softirq() || in_hardirq()) return; WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + softirq_count(), ROEC_ARGS); /* * non-preemptible RCU in a preemptible kernel uses preempt_disable() @@ -1965,6 +2029,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count()) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } @@ -1978,8 +2045,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, - struct torture_random_state *trsp, +static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { bool first; @@ -1993,8 +2059,8 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); - rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); + WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -2014,8 +2080,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; // Complain unless both the old and the new protection is in place. - rcutorture_one_extend_check("during change", - idxold1 | statesnew, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold); // Sample CPU under both sets of protections to reduce confusion. if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { @@ -2069,6 +2134,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } + if (statesold & RCUTORTURE_RDR_UPDOWN) { + cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; + } /* Delay if neither beginning nor end and there was a change. 
*/ if ((statesnew || statesold) && *readstate && newstate) @@ -2085,7 +2155,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, WARN_ON_ONCE(*readstate < 0); if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) pr_info("Unexpected readstate value of %#x\n", *readstate); - rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -2152,8 +2222,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. */ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp, - struct rt_read_seg *rtrsp) +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; int j; @@ -2167,7 +2236,8 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); - rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); + WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); + rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } return &rtrsp[j]; } @@ -2209,10 +2279,11 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp rtorsp->started = cur_ops->get_gp_seq(); rtorsp->ts = rcu_trace_clock_local(); rtorsp->p = rcu_dereference_check(rcu_torture_current, - !cur_ops->readlock_held || cur_ops->readlock_held()); + !cur_ops->readlock_held || cur_ops->readlock_held() || + (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); return false; } if (rtorsp->p->rtort_mbtest == 0) @@ -2226,7 +2297,7 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp * critical sections and check for errors. */ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, - struct torture_random_state *trsp, long myid) + struct torture_random_state *trsp) { int i; unsigned long completed; @@ -2273,7 +2344,7 @@ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); - rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); WARN_ON_ONCE(rtorsp->readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. 
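The read_start/read_end split above exists so that a single reader can be opened in one context and closed in another. The srcu_down_read()/srcu_up_read() primitives allow exactly that: unlike srcu_read_lock(), the matching "up" need not run in the task that did the "down", it only has to happen exactly once per "down". A rough usage sketch under that assumption (names here are illustrative, not from the patch):

    #include <linux/srcu.h>

    DEFINE_STATIC_SRCU(example_srcu);          /* hypothetical srcu_struct */
    static int example_idx;

    static void example_begin_reader(void)     /* e.g. from a kthread */
    {
            example_idx = srcu_down_read(&example_srcu);
            /* ... hand example_idx off, e.g. arm an hrtimer ... */
    }

    static void example_end_reader(void)       /* e.g. from that hrtimer handler */
    {
            srcu_up_read(&example_srcu, example_idx);
    }
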
@@ -2302,13 +2373,14 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); init_rcu_torture_one_read_state(&rtors, trsp); newstate = rcutorture_extend_mask(rtors.readstate, trsp); - rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++); + WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); + rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { - rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp); + rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp); return false; } - rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp); - rcu_torture_one_read_end(&rtors, trsp, myid); + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp); return true; } @@ -2378,6 +2450,152 @@ rcu_torture_reader(void *arg) return 0; } +struct rcu_torture_one_read_state_updown { + struct hrtimer rtorsu_hrt; + bool rtorsu_inuse; + ktime_t rtorsu_kt; + int rtorsu_cpu; + unsigned long rtorsu_j; + unsigned long rtorsu_ndowns; + unsigned long rtorsu_nups; + unsigned long rtorsu_nmigrates; + struct torture_random_state rtorsu_trs; + struct rcu_torture_one_read_state rtorsu_rtors; +}; + +static struct rcu_torture_one_read_state_updown *updownreaders; +static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand); +static int rcu_torture_updown(void *arg); + +static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) +{ + int cpu = raw_smp_processor_id(); + struct rcu_torture_one_read_state_updown *rtorsup; + + rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + WRITE_ONCE(rtorsup->rtorsu_nmigrates, + rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu)); + smp_store_release(&rtorsup->rtorsu_inuse, false); + return HRTIMER_NORESTART; +} + +static int rcu_torture_updown_init(void) +{ + int i; + struct torture_random_state *rand = &rcu_torture_updown_rand; + int ret; + + if (n_up_down < 0) + return 0; + if (!srcu_torture_have_up_down()) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives"); + return 0; + } + updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL); + if (!updownreaders) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests"); + return -ENOMEM; + } + for (i = 0; i < n_up_down; i++) { + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); + hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + torture_random_init(&updownreaders[i].rtorsu_trs); + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, + &updownreaders[i].rtorsu_trs); + } + ret = torture_create_kthread(rcu_torture_updown, rand, updown_task); + if (ret) { + kfree(updownreaders); + updownreaders = NULL; + } + return ret; +} + +static void rcu_torture_updown_cleanup(void) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) 
+ continue; + if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + smp_store_release(&rtorsup->rtorsu_inuse, false); + } + + } + kfree(updownreaders); + updownreaders = NULL; +} + +// Do one reader for rcu_torture_updown(). +static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup) +{ + int idx; + int rawidx; + ktime_t t; + + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + rtorsup->rtorsu_cpu = raw_smp_processor_id(); + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + schedule_timeout_idle(HZ); + return; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + smp_mb(); // Sample jiffies after posting hrtimer. + rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler. + rtorsup->rtorsu_kt = t; +} + +/* + * RCU torture up/down reader kthread, starting RCU readers in kthread + * context and ending them in hrtimer handlers. Otherwise similar to + * rcu_torture_reader(). + */ +static int +rcu_torture_updown(void *arg) +{ + unsigned long j; + struct rcu_torture_one_read_state_updown *rtorsup; + + VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); + do { + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (torture_must_stop()) + break; + j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse. + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) { + WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10), + "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j); + continue; + } + rcu_torture_updown_one(rtorsup); + } + torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); + stutter_wait("rcu_torture_updown"); + } while (!torture_must_stop()); + rcu_torture_updown_cleanup(); + torture_kthread_stopping("rcu_torture_updown"); + return 0; +} + /* * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to * increase race probabilities and fuzzes the interval between toggling. 
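The ->rtorsu_inuse flag above is the hand-off point between the up/down kthread and the hrtimer handler, which is why it is written with smp_store_release() and read with smp_load_acquire(): the release publishes all earlier updates to the slot, and the acquire guarantees the kthread sees the handler's final writes before reusing it. A stripped-down sketch of that pattern (struct and function names are hypothetical, not from the patch):

    #include <linux/atomic.h>

    struct example_slot {
            bool inuse;
            unsigned long nups;
    };

    static void example_handler_finish(struct example_slot *s)
    {
            s->nups++;                            /* slot updates first ...   */
            smp_store_release(&s->inuse, false);  /* ... then release the slot */
    }

    static bool example_kthread_can_reuse(struct example_slot *s)
    {
            /*
             * Acquire pairs with the release above: if this reads false,
             * every write the handler made beforehand is visible here.
             */
            return !smp_load_acquire(&s->inuse);
    }
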
@@ -2441,6 +2659,10 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long n_gpwraps = 0; + unsigned long ndowns = 0; + unsigned long nunexpired = 0; + unsigned long nmigrates = 0; + unsigned long nups = 0; struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; @@ -2454,10 +2676,18 @@ rcu_torture_stats_print(void) if (cur_ops->get_gpwrap_count) n_gpwraps += cur_ops->get_gpwrap_count(cpu); } + if (updownreaders) { + for (i = 0; i < n_up_down; i++) { + ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns); + nups += READ_ONCE(updownreaders[i].rtorsu_nups); + nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse); + nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates); + } + } for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; - } + } // The value of variable "i" is used later, so don't clobber it! pr_alert("%s%s ", torture_type, TORTURE_FLAG); rtcp = rcu_access_pointer(rcu_torture_current); @@ -2478,6 +2708,8 @@ rcu_torture_stats_print(void) n_rcu_torture_boost_failure, n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); + if (updownreaders) + pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), @@ -2632,7 +2864,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " - "preempt_duration=%d preempt_interval=%d\n", + "preempt_duration=%d preempt_interval=%d n_up_down=%d\n", torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2646,7 +2878,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis, - preempt_duration, preempt_interval); + preempt_duration, preempt_interval, n_up_down); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3749,6 +3981,10 @@ rcu_torture_cleanup(void) nocb_tasks = NULL; } + if (updown_task) { + torture_stop_kthread(rcu_torture_updown, updown_task); + updown_task = NULL; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -4245,11 +4481,6 @@ rcu_torture_init(void) /* Start up the kthreads. */ rcu_torture_write_types(); - firsterr = torture_create_kthread(rcu_torture_writer, NULL, - writer_task); - if (torture_init_error(firsterr)) - goto unwind; - if (nrealfakewriters > 0) { fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), @@ -4282,6 +4513,15 @@ rcu_torture_init(void) if (torture_init_error(firsterr)) goto unwind; } + + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (torture_init_error(firsterr)) + goto unwind; + + firsterr = rcu_torture_updown_init(); + if (torture_init_error(firsterr)) + goto unwind; nrealnocbers = nocbs_nthreads; if (WARN_ON(nrealnocbers < 0)) nrealnocbers = 1; diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f11a7c2af778..df646e0694a8 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -85,7 +85,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, // Number of typesafe_lookup structures, that is, the degree of concurrency. 
torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000, "Number of loops per experiment."); +torture_param(int, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. @@ -246,36 +246,6 @@ static const struct ref_scale_ops srcu_fast_ops = { .name = "srcu-fast" }; -static void srcu_lite_ref_scale_read_section(const int nloops) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); - srcu_read_unlock_lite(srcu_ctlp, idx); - } -} - -static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); - un_delay(udl, ndl); - srcu_read_unlock_lite(srcu_ctlp, idx); - } -} - -static const struct ref_scale_ops srcu_lite_ops = { - .init = rcu_sync_scale_init, - .readsection = srcu_lite_ref_scale_read_section, - .delaysection = srcu_lite_ref_scale_delay_section, - .name = "srcu-lite" -}; - #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -1140,7 +1110,7 @@ static void ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } @@ -1193,7 +1163,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, @@ -1238,12 +1208,16 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). 
if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops)) loops = 1; if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) nreaders = 1; if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) nruns = 1; + if (WARN_ONCE(loops > INT_MAX / nreaders, + "%s: nreaders * loops will overflow, adjusted loops to %d", + __func__, INT_MAX / nreaders)) + loops = INT_MAX / nreaders; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 48047260697e..c5e8ebc493d5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -502,6 +502,8 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) */ if (!did_gp) smp_mb(); /* A */ + else if (srcu_gp_is_expedited(ssp)) + synchronize_rcu_expedited(); /* X */ else synchronize_rcu(); /* X */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14d4499c6fc3..174ee243b349 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -160,7 +160,6 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); -static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); static bool rcu_rdp_cpu_online(struct rcu_data *rdp); @@ -377,7 +376,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_eqs); */ static int rcu_is_cpu_rrupt_from_idle(void) { - long nesting; + long nmi_nesting = ct_nmi_nesting(); /* * Usually called from the tick; but also used from smp_function_call() @@ -389,21 +388,28 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* Check for counter underflows */ RCU_LOCKDEP_WARN(ct_nesting() < 0, "RCU nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0, - "RCU nmi_nesting counter underflow/zero!"); - /* Are we at first interrupt nesting level? */ - nesting = ct_nmi_nesting(); - if (nesting > 1) + /* Non-idle interrupt or nested idle interrupt */ + if (nmi_nesting > 1) return false; /* - * If we're not in an interrupt, we must be in the idle task! + * Non nested idle interrupt (interrupting section where RCU + * wasn't watching). */ - WARN_ON_ONCE(!nesting && !is_idle_task(current)); + if (nmi_nesting == 1) + return true; + + /* Not in an interrupt */ + if (!nmi_nesting) { + RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current), + "RCU nmi_nesting counter not in idle task!"); + return !rcu_is_watching_curr_cpu(); + } - /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_nesting() == 0; + RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!"); + + return false; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) @@ -1625,8 +1631,10 @@ static void rcu_sr_put_wait_head(struct llist_node *node) atomic_set_release(&sr_wn->inuse, 0); } -/* Disabled by default. */ -static int rcu_normal_wake_from_gp; +/* Enable rcu_normal_wake_from_gp automatically on small systems. 
*/ +#define WAKE_FROM_GP_CPU_THRESHOLD 16 + +static int rcu_normal_wake_from_gp = -1; module_param(rcu_normal_wake_from_gp, int, 0644); static struct workqueue_struct *sync_wq; @@ -1829,6 +1837,18 @@ static noinline_for_stack bool rcu_gp_init(void) start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ old_gp_seq = rcu_state.gp_seq; + /* + * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug + * scan below. Otherwise we risk a race where a newly onlining CPU could + * be missed by the current grace period, potentially leading to + * use-after-free errors. For a detailed explanation of this race, see + * Documentation/RCU/Design/Requirements/Requirements.rst in the + * "Hotplug CPU" section. + * + * Also note that the root rnp's gp_seq is kept separate from, and lags, + * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on + * Single-node systems for more details (in Data-Structures.rst). + */ rcu_seq_start(&rcu_state.gp_seq); /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && @@ -1865,6 +1885,10 @@ static noinline_for_stack bool rcu_gp_init(void) /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { local_irq_disable(); + /* + * Serialize with CPU offline. See Requirements.rst > Hotplug CPU > + * Concurrent Quiescent State Reporting for Offline CPUs. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1939,7 +1963,12 @@ static noinline_for_stack bool rcu_gp_init(void) trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - /* Quiescent states for tasks on any now-offline CPUs. */ + /* + * Quiescent states for tasks on any now-offline CPUs. Since we + * released the ofl and rnp lock before this loop, CPUs might + * have gone offline and we have to report QS on their behalf. + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting. + */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) @@ -3243,7 +3272,7 @@ static void synchronize_rcu_normal(void) trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); - if (!READ_ONCE(rcu_normal_wake_from_gp)) { + if (READ_ONCE(rcu_normal_wake_from_gp) < 1) { wait_rcu_gp(call_rcu_hurry); goto trace_complete_out; } @@ -4268,7 +4297,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ - sync_sched_exp_online_cleanup(cpu); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); @@ -4358,6 +4386,12 @@ void rcutree_report_cpu_dead(void) * may introduce a new READ-side while it is actually off the QS masks. */ lockdep_assert_irqs_disabled(); + /* + * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution. + * The requested QS must have been reported on the last context switch + * from stop machine to idle. + */ + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); @@ -4365,6 +4399,13 @@ void rcutree_report_cpu_dead(void) /* Remove outgoing CPU from mask in the leaf rcu_node structure. 
*/ mask = rdp->grpmask; + + /* + * Hold the ofl_lock and rnp lock to avoid races between CPU going + * offline and doing a QS report (as below), versus rcu_gp_init(). + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section + * for more details. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4375,6 +4416,7 @@ void rcutree_report_cpu_dead(void) rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } + /* Clear from ->qsmaskinitnext to mark offline. */ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); @@ -4847,6 +4889,12 @@ void __init rcu_init(void) sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); WARN_ON(!sync_wq); + /* Respect if explicitly disabled via a boot parameter. */ + if (rcu_normal_wake_from_gp < 0) { + if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) + rcu_normal_wake_from_gp = 1; + } + /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ if (qovld < 0) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3830c19cf2f6..de6ca13a7b5f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -174,6 +174,17 @@ struct rcu_snap_record { unsigned long jiffies; /* Track jiffies value */ }; +/* + * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention. + * to report quiescent states at the soonest possible time. + * The request can be in one of the following states: + * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled. + * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it + * ran and we still haven't reported a quiescent state. + */ +#define DEFER_QS_IDLE 0 +#define DEFER_QS_PENDING 1 + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -192,7 +203,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ - bool defer_qs_iw_pending; /* Scheduler attention pending? */ + int defer_qs_iw_pending; /* Scheduler attention pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index c36c7d5575ca..6058a734090c 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -141,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); WRITE_ONCE(rnp->expmask, rnp->expmaskinit); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp)) + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -393,13 +400,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited GP - * until such time as the ->expmask bits are cleared. 
- */ - if (rcu_preempt_has_tasks(rnp)) - WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -751,12 +751,8 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, is there no need for a quiescent state from this CPU, - * or is this CPU already looking for a quiescent state for the - * current grace period? If either is the case, just leave. - * However, this should not happen due to the preemptible - * sync_sched_exp_online_cleanup() implementation being a no-op, - * so warn if this does happen. + * WARN if the CPU is unexpectedly already looking for a + * QS or has already reported one. */ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || @@ -803,11 +799,6 @@ static void rcu_exp_handler(void *unused) WARN_ON_ONCE(1); } -/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each that is blocking the current @@ -885,38 +876,6 @@ static void rcu_exp_handler(void *unused) rcu_exp_need_qs(); } -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - unsigned long flags; - int my_cpu; - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - my_cpu = get_cpu(); - /* Quiescent state either not needed or already requested, leave. */ - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - READ_ONCE(rdp->cpu_no_qs.b.exp)) { - put_cpu(); - return; - } - /* Quiescent state needed on current CPU, so set it up locally. */ - if (my_cpu == cpu) { - local_irq_save(flags); - rcu_exp_need_qs(); - local_irq_restore(flags); - put_cpu(); - return; - } - /* Quiescent state needed on some other CPU, send IPI. */ - ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); - put_cpu(); - WARN_ON_ONCE(ret); -} - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections that are diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index b473ff056f49..e6cd56603cad 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, * callback storms, no need to wake up too early. 
*/ if (waketype == RCU_NOCB_WAKE_LAZY && - rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { + rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { @@ -1146,7 +1146,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(cpu_online(rdp->cpu)); /* @@ -1156,7 +1155,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return -EINVAL; - if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + if (WARN_ON_ONCE(!rdp->nocb_gp_kthread)) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); @@ -1166,7 +1165,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); + wake_up_process(rdp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_nocb_rdp_offload_wait_cond(rdp)); @@ -1564,6 +1563,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, &rdp->nocb_entry_rdp, typeof(*rdp), diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0b0f56f6abc8..fc14adf15cbb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) struct rcu_node *rnp; union rcu_special special; + rdp = this_cpu_ptr(&rcu_data); + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) + rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - rdp = this_cpu_ptr(&rcu_data); if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; @@ -534,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_exp_done(rnp); - smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; @@ -624,10 +626,98 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) */ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) { + unsigned long flags; struct rcu_data *rdp; rdp = container_of(iwp, struct rcu_data, defer_qs_iw); - rdp->defer_qs_iw_pending = false; + local_irq_save(flags); + + /* + * If the IRQ work handler happens to run in the middle of RCU read-side + * critical section, it could be ineffective in getting the scheduler's + * attention to report a deferred quiescent state (the whole point of the + * IRQ work). For this reason, requeue the IRQ work. + * + * Basically, we want to avoid following situation: + * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING) + * 2. CPU enters new rcu_read_lock() + * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0 + * 4. rcu_read_unlock() does not re-queue work (state still PENDING) + * 5. Deferred QS reporting does not happen. 
+ */ + if (rcu_preempt_depth() > 0) + WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); + + local_irq_restore(flags); +} + +/* + * Check if expedited grace period processing during unlock is needed. + * + * This function determines whether expedited handling is required based on: + * 1. Task blocking an expedited grace period (based on a heuristic, could be + * false-positive, see below.) + * 2. CPU participating in an expedited grace period + * 3. Strict grace period mode requiring expedited handling + * 4. RCU priority deboosting needs when interrupts were disabled + * + * @t: The task being checked + * @rdp: The per-CPU RCU data + * @rnp: The RCU node for this CPU + * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock() + * + * Returns true if expedited processing of the rcu_read_unlock() is needed. + */ +static bool rcu_unlock_needs_exp_handling(struct task_struct *t, + struct rcu_data *rdp, + struct rcu_node *rnp, + bool irqs_were_disabled) +{ + /* + * Check if this task is blocking an expedited grace period. If the + * task was preempted within an RCU read-side critical section and is + * on the expedited grace period blockers list (exp_tasks), we need + * expedited handling to unblock the expedited GP. This is not an exact + * check because 't' might not be on the exp_tasks list at all - its + * just a fast heuristic that can be false-positive sometimes. + */ + if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) + return true; + + /* + * Check if this CPU is participating in an expedited grace period. + * The expmask bitmap tracks which CPUs need to check in for the + * current expedited GP. If our CPU's bit is set, we need expedited + * handling to help complete the expedited GP. + */ + if (rdp->grpmask & READ_ONCE(rnp->expmask)) + return true; + + /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods + * are treated as short for testing purposes even if that means + * disturbing the system more. Check if either: + * - This CPU has not yet reported a quiescent state, or + * - This task was preempted within an RCU critical section + * In either case, require expedited handling for strict GP mode. + */ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) + return true; + + /* + * RCU priority boosting case: If a task is subject to RCU priority + * boosting and exits an RCU read-side critical section with interrupts + * disabled, we need expedited handling to ensure timely deboosting. + * Without this, a low-priority task could incorrectly run at high + * real-time priority for an extended period degrading real-time + * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels, + * not just to PREEMPT_RT. + */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node) + return true; + + return false; } /* @@ -649,18 +739,14 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - bool expboost; // Expedited GP in flight or possible boosting. + bool needs_exp; // Expedited handling needed. 
struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || - (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && - t->rcu_blocked_node); + needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled); + // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. @@ -673,17 +759,13 @@ static void rcu_read_unlock_special(struct task_struct *t) set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { + needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - IS_ENABLED(CONFIG_PREEMPT_RT)) - rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( - rcu_preempt_deferred_qs_handler); - else - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); - rdp->defer_qs_iw_pending = true; + rdp->defer_qs_iw = + IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 486c00536207..2a5a5329322d 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -17,8 +17,37 @@ // Controlling CPU stall warnings, including delay calculation. /* panic() on RCU Stall sysctl. 
*/ -int sysctl_panic_on_rcu_stall __read_mostly; -int sysctl_max_rcu_stall_to_panic __read_mostly; +static int sysctl_panic_on_rcu_stall __read_mostly; +static int sysctl_max_rcu_stall_to_panic __read_mostly; + +static const struct ctl_table rcu_stall_sysctl_table[] = { + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +}; + +static int __init init_rcu_stall_sysctl(void) +{ + register_sysctl_init("kernel", rcu_stall_sysctl_table); + return 0; +} + +subsys_initcall(init_rcu_stall_sysctl); #ifdef CONFIG_SYSFS @@ -953,8 +982,7 @@ void show_rcu_gp_kthreads(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) - show_rcu_nocb_state(rdp); + show_rcu_nocb_state(rdp); } pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2b331822c7e7..cdea931aae30 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -4,6 +4,9 @@ * Auto-group scheduling implementation: */ +#include "autogroup.h" +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; @@ -25,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void) { register_sysctl_init("kernel", sched_autogroup_sysctls); } -#else +#else /* !CONFIG_SYSCTL: */ #define sched_autogroup_sysctl_init() do { } while (0) -#endif +#endif /* !CONFIG_SYSCTL */ void __init autogroup_init(struct task_struct *init_task) { @@ -108,7 +111,7 @@ static inline struct autogroup *autogroup_create(void) free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; tg->rt_rq = root_task_group.rt_rq; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ tg->autogroup = ag; sched_online_group(tg, &root_task_group); diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 90d69f2c5eaf..06c82b2bdfb5 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,8 @@ #ifndef _KERNEL_SCHED_AUTOGROUP_H #define _KERNEL_SCHED_AUTOGROUP_H +#include "sched.h" + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -41,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) extern int autogroup_path(struct task_group *tg, char *buf, int buflen); -#else /* !CONFIG_SCHED_AUTOGROUP */ +#else /* !CONFIG_SCHED_AUTOGROUP: */ static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } @@ -61,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) return 0; } -#endif /* CONFIG_SCHED_AUTOGROUP */ +#endif /* !CONFIG_SCHED_AUTOGROUP */ #endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 72d97aa8b726..c4a488e67aa7 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -50,11 +50,9 @@ #include "idle.c" #include "rt.c" +#include "cpudeadline.c" -#ifdef CONFIG_SMP -# include "cpudeadline.c" -# include "pelt.c" -#endif 
+#include "pelt.c" #include "cputime.c" #include "deadline.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index bf9d8db94b70..e2cf3b08d4e9 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -80,11 +80,10 @@ #include "wait_bit.c" #include "wait.c" -#ifdef CONFIG_SMP -# include "cpupri.c" -# include "stop_task.c" -# include "topology.c" -#endif +#include "cpupri.c" +#include "stop_task.c" + +#include "topology.c" #ifdef CONFIG_SCHED_CORE # include "core_sched.c" diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a09655b48140..f5e6dd6a6b3a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -54,6 +54,9 @@ * */ +#include <linux/sched/clock.h> +#include "sched.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. @@ -471,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ void __init sched_clock_init(void) { @@ -489,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * Running clock - returns the time that has elapsed while a guest has been diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 3561ab533dd4..19ee702273c0 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,11 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ +#include <linux/linkage.h> +#include <linux/sched/debug.h> +#include <linux/completion.h> +#include "sched.h" + static void complete_with_flags(struct completion *x, int wake_flags) { unsigned long flags; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81c6df746df1..66d93a872968 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -69,8 +69,8 @@ #include <linux/livepatch_sched.h> #ifdef CONFIG_PREEMPT_DYNAMIC -# ifdef CONFIG_GENERIC_ENTRY -# include <linux/entry-common.h> +# ifdef CONFIG_GENERIC_IRQ_ENTRY +# include <linux/irq-entry-common.h> # endif #endif @@ -96,6 +96,7 @@ #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" #include "../smpboot.h" +#include "../locking/mutex.h" EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -119,6 +120,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_SCHED_PROXY_EXEC +DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); +static int __init setup_proxy_exec(char *str) +{ + bool proxy_enable = true; + + if (*str && kstrtobool(str + 1, &proxy_enable)) { + pr_warn("Unable to parse sched_proxy_exec=\n"); + return 0; + } + + if (proxy_enable) { + pr_info("sched_proxy_exec enabled via boot arg\n"); + static_branch_enable(&__sched_proxy_exec); + } else { + pr_info("sched_proxy_exec disabled via boot arg\n"); + static_branch_disable(&__sched_proxy_exec); + } + return 1; +} +#else +static int __init setup_proxy_exec(char *str) +{ + pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n"); + return 0; +} +#endif +__setup("sched_proxy_exec", setup_proxy_exec); + /* * Debugging: various feature bits * @@ -481,13 +511,13 @@ void sched_core_put(void) schedule_work(&_work); } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_enqueue(struct rq 
*rq, struct task_struct *p) { } static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); @@ -650,7 +680,6 @@ void raw_spin_rq_unlock(struct rq *rq) raw_spin_unlock(rq_lockp(rq)); } -#ifdef CONFIG_SMP /* * double_rq_lock - safely lock two runqueues */ @@ -667,7 +696,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif /* * __task_rq_lock - lock the rq @p resides on. @@ -853,8 +881,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -#ifdef CONFIG_SMP - static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; @@ -899,33 +925,12 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and IRQs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - delay = max_t(u64, delay, 10000LL); - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_HARD); -} - -#endif /* CONFIG_SMP */ - static void hrtick_rq_init(struct rq *rq) { -#ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -#endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } -#else /* CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_clear(struct rq *rq) { } @@ -933,7 +938,7 @@ static inline void hrtick_clear(struct rq *rq) static inline void hrtick_rq_init(struct rq *rq) { } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ /* * try_cmpxchg based fetch_or() macro so it works for different integer types: @@ -949,7 +954,7 @@ static inline void hrtick_rq_init(struct rq *rq) _val; \ }) -#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +#ifdef TIF_POLLING_NRFLAG /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -988,13 +993,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) return true; } -#ifdef CONFIG_SMP static inline bool set_nr_if_polling(struct task_struct *p) { return false; } #endif -#endif static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { @@ -1110,6 +1113,7 @@ static void __resched_curr(struct rq *rq, int tif) cpu = cpu_of(rq); + trace_sched_set_need_resched_tp(curr, cpu, tif); if (cpu == smp_processor_id()) { set_ti_thread_flag(cti, tif); if (tif == TIF_NEED_RESCHED) @@ -1125,6 +1129,11 @@ static void __resched_curr(struct rq *rq, int tif) } } +void __trace_set_need_resched(struct task_struct *curr, int tif) +{ + trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); +} + void resched_curr(struct rq *rq) { __resched_curr(rq, TIF_NEED_RESCHED); @@ -1167,7 +1176,6 @@ void resched_cpu(int cpu) raw_spin_rq_unlock_irqrestore(rq, flags); } -#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy CPU for migrating timers @@ -1374,10 +1382,8 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ -#endif /* CONFIG_SMP */ -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && 
\ - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. @@ -1971,7 +1977,7 @@ undo: sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; } -#endif +#endif /* CONFIG_SYSCTL */ static void uclamp_fork(struct task_struct *p) { @@ -2037,13 +2043,13 @@ static void __init init_uclamp(void) } } -#else /* !CONFIG_UCLAMP_TASK */ +#else /* !CONFIG_UCLAMP_TASK: */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } -#endif /* CONFIG_UCLAMP_TASK */ +#endif /* !CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) { @@ -2353,8 +2359,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state return ncsw; } -#ifdef CONFIG_SMP - static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); @@ -2936,8 +2940,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + /* + * Can the task run on the task's current CPU? If so, we're done + * + * We are also done if the task is the current donor, boosting a lock- + * holding proxy, (and potentially has been migrated outside its + * current or previous affinity mask) + */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) || + (task_current_donor(rq, p) && !task_current(rq, p))) { struct task_struct *push_task = NULL; if ((flags & SCA_MIGRATE_ENABLE) && @@ -3305,6 +3316,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) WARN_ON_ONCE(ret); } +#ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { unsigned int state = READ_ONCE(p->__state); @@ -3358,6 +3371,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) @@ -3657,17 +3671,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* CONFIG_SMP */ - -static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } - -static inline bool rq_has_pinned_tasks(struct rq *rq) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -3678,7 +3681,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) rq = this_rq(); -#ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); __schedstat_inc(p->stats.nr_wakeups_local); @@ -3698,7 +3700,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) __schedstat_inc(p->stats.nr_wakeups_migrate); -#endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); __schedstat_inc(p->stats.nr_wakeups); @@ -3727,13 +3728,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (p->sched_contributes_to_load) rq->nr_uninterruptible--; -#ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= 
ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else -#endif if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); @@ -3744,7 +3743,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ttwu_do_wakeup(p); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Our task @p is fully woken up and running; so it's safe to @@ -3766,7 +3764,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif } /* @@ -3820,7 +3817,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) return ret; } -#ifdef CONFIG_SMP void sched_ttwu_pending(void *arg) { struct llist_node *llist = arg; @@ -3887,7 +3883,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); +#ifdef CONFIG_SMP __smp_call_single_queue(cpu, &p->wake_entry.llist); +#endif } void wake_up_if_idle(int cpu) @@ -3993,15 +3991,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP */ - -static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -{ - return false; -} - -#endif /* CONFIG_SMP */ - static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -4257,7 +4246,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) break; -#ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -4336,9 +4324,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else - cpu = task_cpu(p); -#endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } @@ -4371,14 +4356,12 @@ static bool __task_needs_rq_lock(struct task_struct *p) if (p->on_rq) return true; -#ifdef CONFIG_SMP /* * Ensure the task has finished __schedule() and will not be referenced * anymore. Again, see try_to_wake_up() for a longer comment. 
*/ smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); -#endif return false; } @@ -4534,10 +4517,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); -#ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; -#endif init_sched_mm_cid(p); } @@ -4600,8 +4581,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write, } return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SCHEDSTATS @@ -4788,14 +4769,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) p->on_cpu = 0; -#endif init_task_preempt_count(p); -#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -#endif + return 0; } @@ -4872,7 +4850,6 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); -#ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_ptr can change in the fork path @@ -4884,7 +4861,6 @@ void wake_up_new_task(struct task_struct *p) p->recent_used_cpu = task_cpu(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); -#endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); @@ -4892,7 +4868,6 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, wake_flags); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Nothing relies on rq->lock after this, so it's fine to @@ -4902,7 +4877,6 @@ void wake_up_new_task(struct task_struct *p) p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); } -#endif task_rq_unlock(rq, p, &rf); } @@ -4979,7 +4953,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(curr, next); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ +#else /* !CONFIG_PREEMPT_NOTIFIERS: */ static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -4991,11 +4965,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif /* CONFIG_PREEMPT_NOTIFIERS */ +#endif /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) { -#ifdef CONFIG_SMP /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. @@ -5004,12 +4977,10 @@ static inline void prepare_task(struct task_struct *next) * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); -#endif } static inline void finish_task(struct task_struct *prev) { -#ifdef CONFIG_SMP /* * This must be the very last reference to @prev from this CPU. After * p->on_cpu is cleared, the task can be moved to a different CPU. We @@ -5022,11 +4993,8 @@ static inline void finish_task(struct task_struct *prev) * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
*/ smp_store_release(&prev->on_cpu, 0); -#endif } -#ifdef CONFIG_SMP - static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5108,14 +5076,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else - -static inline void __balance_callbacks(struct rq *rq) -{ -} - -#endif - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -5325,7 +5285,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) * switched the context for the first time. It is returning from * schedule for the first time in this path. */ - trace_sched_exit_tp(true, CALLER_ADDR0); + trace_sched_exit_tp(true); preempt_enable(); if (current->set_child_tid) @@ -5503,8 +5463,6 @@ unsigned int nr_iowait(void) return sum; } -#ifdef CONFIG_SMP - /* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. @@ -5528,8 +5486,6 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif - DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); @@ -5564,7 +5520,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +#ifdef CONFIG_64BIT /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. @@ -5691,12 +5647,10 @@ void sched_tick(void) if (donor->flags & PF_WQ_WORKER) wq_worker_tick(donor); -#ifdef CONFIG_SMP if (!scx_switched_all()) { rq->idle_balance = idle_cpu(cpu); sched_balance_trigger(rq); } -#endif } #ifdef CONFIG_NO_HZ_FULL @@ -5836,10 +5790,10 @@ int __init sched_tick_offload_init(void) return 0; } -#else /* !CONFIG_NO_HZ_FULL */ +#else /* !CONFIG_NO_HZ_FULL: */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) @@ -6554,7 +6508,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) rq->core = rq; } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_cpu_starting(unsigned int cpu) {} static inline void sched_core_cpu_deactivate(unsigned int cpu) {} @@ -6566,7 +6520,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return __pick_next_task(rq, prev, rf); } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* * Constants for the sched_mode argument of __schedule(). @@ -6582,11 +6536,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Helper function for __schedule() * - * If a task does not have signals pending, deactivate it - * Otherwise marks the task's __state as RUNNING + * Tries to deactivate the task, unless the should_block arg + * is false or if a signal is pending. In the case a signal + * is pending, marks the task's __state as RUNNING (and clear + * blocked_on). 
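prepare_task()/finish_task() above publish a context switch through p->on_cpu: the old CPU's last reference is dropped with a release store, and the wakeup side spins with an acquire load before it may touch the task again. A minimal userspace model of that ordering follows, with C11 atomics and pthreads standing in for the kernel primitives.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Just the fields this ordering example needs from task_struct. */
struct task {
	atomic_int on_cpu;      /* 1 while the previous CPU still references the task */
	int saved_state;        /* written by that CPU before it lets go */
};

static struct task t = { .on_cpu = 1 };

/* Models finish_task(): make all prior writes visible, then drop on_cpu. */
static void *old_cpu(void *arg)
{
	(void)arg;
	t.saved_state = 42;     /* context-switch bookkeeping */
	atomic_store_explicit(&t.on_cpu, 0, memory_order_release);
	return NULL;
}

/* Models smp_cond_load_acquire(&p->on_cpu, !VAL) on the wakeup side. */
static void *waker(void *arg)
{
	(void)arg;
	while (atomic_load_explicit(&t.on_cpu, memory_order_acquire))
		;               /* spin until the old CPU is fully done with the task */
	printf("saved_state=%d\n", t.saved_state);      /* guaranteed to read 42 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waker, NULL);
	pthread_create(&b, NULL, old_cpu, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}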
*/ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long *task_state_p) + unsigned long *task_state_p, bool should_block) { unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; @@ -6597,6 +6553,16 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return false; } + /* + * We check should_block after signal_pending because we + * will want to wake the task in that case. But if + * should_block is false, its likely due to the task being + * blocked on a mutex, and we want to keep it on the runqueue + * to be selectable for proxy-execution. + */ + if (!should_block) + return false; + p->sched_contributes_to_load = (task_state & TASK_UNINTERRUPTIBLE) && !(task_state & TASK_NOLOAD) && @@ -6620,6 +6586,194 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return true; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline struct task_struct *proxy_resched_idle(struct rq *rq) +{ + put_prev_set_next_task(rq, rq->donor, rq->idle); + rq_set_donor(rq, rq->idle); + set_tsk_need_resched(rq->idle); + return rq->idle; +} + +static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + unsigned long state = READ_ONCE(donor->__state); + + /* Don't deactivate if the state has been changed to TASK_RUNNING */ + if (state == TASK_RUNNING) + return false; + /* + * Because we got donor from pick_next_task(), it is *crucial* + * that we call proxy_resched_idle() before we deactivate it. + * As once we deactivate donor, donor->on_rq is set to zero, + * which allows ttwu() to immediately try to wake the task on + * another rq. So we cannot use *any* references to donor + * after that point. So things like cfs_rq->curr or rq->donor + * need to be changed from next *before* we deactivate. + */ + proxy_resched_idle(rq); + return try_to_block_task(rq, donor, &state, true); +} + +static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + if (!__proxy_deactivate(rq, donor)) { + /* + * XXX: For now, if deactivation failed, set donor + * as unblocked, as we aren't doing proxy-migrations + * yet (more logic will be needed then). + */ + donor->blocked_on = NULL; + } + return NULL; +} + +/* + * Find runnable lock owner to proxy for mutex blocked donor + * + * Follow the blocked-on relation: + * task->blocked_on -> mutex->owner -> task... + * + * Lock order: + * + * p->pi_lock + * rq->lock + * mutex->wait_lock + * + * Returns the task that is going to be used as execution context (the one + * that is actually going to be run on cpu_of(rq)). + */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + struct task_struct *owner = NULL; + int this_cpu = cpu_of(rq); + struct task_struct *p; + struct mutex *mutex; + + /* Follow blocked_on chain. */ + for (p = donor; task_is_blocked(p); p = owner) { + mutex = p->blocked_on; + /* Something changed in the chain, so pick again */ + if (!mutex) + return NULL; + /* + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() + * and ensure @owner sticks around. + */ + guard(raw_spinlock)(&mutex->wait_lock); + + /* Check again that p is blocked with wait_lock held */ + if (mutex != __get_task_blocked_on(p)) { + /* + * Something changed in the blocked_on chain and + * we don't know if only at this level. So, let's + * just bail out completely and let __schedule() + * figure things out (pick_again loop). 
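find_proxy_task(), introduced above and continuing below, walks the blocked_on chain added for proxy execution until it reaches a task that is not itself blocked, and runs that owner on behalf of the donor's scheduling context. Here is a stripped-down, single-threaded sketch of just the chain walk; the locking, same-CPU, on_rq and migration handling of the real function are all omitted, and proxy_of() is a made-up name.

#include <stddef.h>
#include <stdio.h>

struct mutex;

struct task {
	const char *name;
	struct mutex *blocked_on;   /* mutex this task sleeps on, or NULL */
};

struct mutex {
	struct task *owner;         /* current owner, or NULL */
};

/*
 * Follow task->blocked_on->owner->blocked_on->... and return the first task
 * that is not itself blocked; that is the execution context worth running
 * on behalf of the donor's scheduling entity.
 */
static struct task *proxy_of(struct task *donor)
{
	struct task *p = donor;

	while (p->blocked_on) {
		struct task *owner = p->blocked_on->owner;

		if (!owner)         /* lock was just released: let the caller pick again */
			return NULL;
		p = owner;
	}
	return p;
}

int main(void)
{
	struct task low = { "low", NULL };
	struct mutex m1 = { &low };
	struct task mid = { "mid", &m1 };
	struct mutex m2 = { &mid };
	struct task high = { "high", &m2 };

	struct task *run = proxy_of(&high);

	printf("run %s on behalf of %s\n", run ? run->name : "(retry)", high.name);
	return 0;
}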
+ */ + return NULL; + } + + owner = __mutex_owner(mutex); + if (!owner) { + __clear_task_blocked_on(p, mutex); + return p; + } + + if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { + /* XXX Don't handle blocked owners/delayed dequeue yet */ + return proxy_deactivate(rq, donor); + } + + if (task_cpu(owner) != this_cpu) { + /* XXX Don't handle migrations yet */ + return proxy_deactivate(rq, donor); + } + + if (task_on_rq_migrating(owner)) { + /* + * One of the chain of mutex owners is currently migrating to this + * CPU, but has not yet been enqueued because we are holding the + * rq lock. As a simple solution, just schedule rq->idle to give + * the migration a chance to complete. Much like the migrate_task + * case we should end up back in find_proxy_task(), this time + * hopefully with all relevant tasks already enqueued. + */ + return proxy_resched_idle(rq); + } + + /* + * Its possible to race where after we check owner->on_rq + * but before we check (owner_cpu != this_cpu) that the + * task on another cpu was migrated back to this cpu. In + * that case it could slip by our checks. So double check + * we are still on this cpu and not migrating. If we get + * inconsistent results, try again. + */ + if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu) + return NULL; + + if (owner == p) { + /* + * It's possible we interleave with mutex_unlock like: + * + * lock(&rq->lock); + * find_proxy_task() + * mutex_unlock() + * lock(&wait_lock); + * donor(owner) = current->blocked_donor; + * unlock(&wait_lock); + * + * wake_up_q(); + * ... + * ttwu_runnable() + * __task_rq_lock() + * lock(&wait_lock); + * owner == p + * + * Which leaves us to finish the ttwu_runnable() and make it go. + * + * So schedule rq->idle so that ttwu_runnable() can get the rq + * lock and mark owner as running. + */ + return proxy_resched_idle(rq); + } + /* + * OK, now we're absolutely sure @owner is on this + * rq, therefore holding @rq->lock is sufficient to + * guarantee its existence, as per ttwu_remote(). + */ + } + + WARN_ON_ONCE(owner && !owner->on_rq); + return owner; +} +#else /* SCHED_PROXY_EXEC */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n"); + return donor; +} +#endif /* SCHED_PROXY_EXEC */ + +static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) +{ + if (!sched_proxy_exec()) + return; + /* + * pick_next_task() calls set_next_task() on the chosen task + * at some point, which ensures it is not push/pullable. + * However, the chosen/donor task *and* the mutex owner form an + * atomic pair wrt push/pull. + * + * Make sure owner we run is not pushable. Unfortunately we can + * only deal with that by means of a dequeue/enqueue cycle. :-/ + */ + dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); + enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); +} + /* * __schedule() is the main scheduler function. 
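The __schedule() hunks that follow add a pick_again loop: the picked entity becomes the donor, and if it is mutex-blocked the scheduler either resolves it to a runnable owner or retries the pick. The following is a deliberately contrived control-flow sketch of that retry structure, with stub pick_next()/find_proxy() functions standing in for pick_next_task() and find_proxy_task().

#include <stdbool.h>
#include <stdio.h>

struct task { const char *name; bool blocked; };

static struct task blocked_waiter = { "waiter", true };
static struct task lock_owner = { "owner", false };

static int pick_attempts;

/* Stand-in for pick_next_task(): always proposes the blocked waiter. */
static struct task *pick_next(void)
{
	pick_attempts++;
	return &blocked_waiter;
}

/* Stand-in for find_proxy_task(): NULL means "something changed, pick again". */
static struct task *find_proxy(struct task *donor)
{
	(void)donor;
	return pick_attempts < 2 ? NULL : &lock_owner;
}

int main(void)
{
	struct task *donor, *next;

pick_again:
	donor = pick_next();            /* scheduling decision: the donor */
	next = donor;                   /* by default, run what we picked */
	if (next->blocked) {
		next = find_proxy(donor);
		if (!next)
			goto pick_again;        /* chain changed under us: retry */
	}
	printf("donor=%s running=%s after %d pick(s)\n",
	       donor->name, next->name, pick_attempts);
	return 0;
}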
* @@ -6674,7 +6828,8 @@ static void __sched notrace __schedule(int sched_mode) struct rq *rq; int cpu; - trace_sched_entry_tp(preempt, CALLER_ADDR0); + /* Trace preemptions consistently with task switches */ + trace_sched_entry_tp(sched_mode == SM_PREEMPT); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -6732,15 +6887,31 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, &prev_state); + /* + * We pass task_is_blocked() as the should_block arg + * in order to keep mutex-blocked tasks on the runqueue + * for slection with proxy-exec (without proxy-exec + * task_is_blocked() will always be false). + */ + try_to_block_task(rq, prev, &prev_state, + !task_is_blocked(prev)); switch_count = &prev->nvcsw; } - next = pick_next_task(rq, prev, &rf); +pick_again: + next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + if (unlikely(task_is_blocked(next))) { + next = find_proxy_task(rq, next, &rf); + if (!next) + goto pick_again; + if (next == rq->idle) + goto keep_resched; + } picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); +keep_resched: rq->last_seen_need_resched_ns = 0; is_switch = prev != next; @@ -6751,6 +6922,10 @@ picked: * changes to task_struct made by pick_next_task(). */ RCU_INIT_POINTER(rq->curr, next); + + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6785,11 +6960,15 @@ picked: /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + /* In case next was already curr but just got blocked_donor */ + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } - trace_sched_exit_tp(is_switch, CALLER_ADDR0); + trace_sched_exit_tp(is_switch); } void __noreturn do_task_dead(void) @@ -6993,14 +7172,14 @@ NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_dynamic_enabled -#define preempt_schedule_dynamic_enabled preempt_schedule -#define preempt_schedule_dynamic_disabled NULL -#endif +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# ifndef preempt_schedule_dynamic_enabled +# define preempt_schedule_dynamic_enabled preempt_schedule +# define preempt_schedule_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) { @@ -7010,8 +7189,8 @@ void __sched notrace dynamic_preempt_schedule(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -7066,14 +7245,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_notrace_dynamic_enabled -#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -#define preempt_schedule_notrace_dynamic_disabled NULL -#endif +# if 
defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# ifndef preempt_schedule_notrace_dynamic_enabled +# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +# define preempt_schedule_notrace_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); void __sched notrace dynamic_preempt_schedule_notrace(void) { @@ -7083,7 +7262,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -#endif +# endif #endif #endif /* CONFIG_PREEMPTION */ @@ -7302,7 +7481,7 @@ out_unlock: preempt_enable(); } -#endif +#endif /* CONFIG_RT_MUTEXES */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) @@ -7333,17 +7512,17 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define cond_resched_dynamic_enabled __cond_resched -#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# define cond_resched_dynamic_enabled __cond_resched +# define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); -#define might_resched_dynamic_enabled __cond_resched -#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +# define might_resched_dynamic_enabled __cond_resched +# define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { @@ -7361,8 +7540,8 @@ int __sched dynamic_might_resched(void) return __cond_resched(); } EXPORT_SYMBOL(dynamic_might_resched); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -7428,9 +7607,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC -#ifdef CONFIG_GENERIC_ENTRY -#include <linux/entry-common.h> -#endif +# ifdef CONFIG_GENERIC_IRQ_ENTRY +# include <linux/irq-entry-common.h> +# endif /* * SC:cond_resched @@ -7485,37 +7664,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -#ifndef CONFIG_PREEMPT_RT +# ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; -#endif +# endif if (!strcmp(str, "full")) return preempt_dynamic_full; -#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY if (!strcmp(str, "lazy")) return preempt_dynamic_lazy; -#endif +# endif return -EINVAL; } -#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define 
preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) -#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) -#else -#error "Unsupported PREEMPT_DYNAMIC mechanism" -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) +# else +# error "Unsupported PREEMPT_DYNAMIC mechanism" +# endif static DEFINE_MUTEX(sched_dynamic_mutex); @@ -7619,7 +7798,7 @@ static void __init preempt_dynamic_init(void) } } -#define PREEMPT_MODEL_ACCESSOR(mode) \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) \ { \ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ @@ -7820,12 +7999,10 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { -#ifdef CONFIG_SMP struct affinity_context ac = (struct affinity_context) { .new_mask = cpumask_of(cpu), .flags = 0, }; -#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -7841,13 +8018,11 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); -#ifdef CONFIG_SMP /* * No validation and serialization required at boot time and for * setting up the idle tasks of not yet online CPUs. */ set_cpus_allowed_common(idle, &ac); -#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the CPU isn't yet set to this CPU so the @@ -7866,9 +8041,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; -#ifdef CONFIG_SMP idle->on_cpu = 1; -#endif raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); @@ -7881,13 +8054,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif } -#ifdef CONFIG_SMP - int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -8123,7 +8292,7 @@ static void balance_hotplug_wait(void) TASK_UNINTERRUPTIBLE); } -#else +#else /* !CONFIG_HOTPLUG_CPU: */ static inline void balance_push(struct rq *rq) { @@ -8137,7 +8306,7 @@ static inline void balance_hotplug_wait(void) { } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* !CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) { @@ -8446,7 +8615,7 @@ int sched_cpu_dying(unsigned int cpu) sched_core_cpu_dying(cpu); return 0; } -#endif +#endif /* CONFIG_HOTPLUG_CPU */ void __init sched_init_smp(void) { @@ -8470,6 +8639,8 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_init_dl_servers(); + sched_smp_initialized = true; } @@ -8480,13 +8651,6 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* CONFIG_SMP */ - int in_sched_functions(unsigned long addr) { return 
in_lock_functions(addr) || @@ -8512,9 +8676,7 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ -#ifdef CONFIG_SMP BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -#endif BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); @@ -8557,9 +8719,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_SMP init_defrootdomain(); -#endif #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, @@ -8620,7 +8780,6 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif -#ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8646,7 +8805,6 @@ void __init sched_init(void) #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); #endif -#endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); @@ -8694,10 +8852,9 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; -#ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); -#endif init_sched_fair_class(); init_sched_ext_class(); @@ -8830,7 +8987,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset) } EXPORT_SYMBOL_GPL(__cant_sleep); -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) { static unsigned long prev_jiffy; @@ -8861,8 +9018,8 @@ void __cant_migrate(const char *file, int line) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_migrate); -#endif -#endif +# endif /* CONFIG_SMP */ +#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) @@ -8902,7 +9059,7 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_KGDB_KDB) +#ifdef CONFIG_KGDB_KDB /* * These functions are only useful for KDB. * @@ -8926,7 +9083,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_KGDB_KDB) */ +#endif /* CONFIG_KGDB_KDB */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -9422,47 +9579,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); -const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ -/* More than 203 days if BW_SHIFT equals 20. */ -static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; - static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, - u64 burst) +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 period, quota, burst; - if (tg == &root_task_group) - return -EINVAL; + period = (u64)period_us * NSEC_PER_USEC; - /* - * Ensure we have at some amount of bandwidth every period. This is - * to prevent reaching a state of large arrears when throttled via - * entity_tick() resulting in prolonged exit starvation. 
- */ - if (quota < min_cfs_quota_period || period < min_cfs_quota_period) - return -EINVAL; - - /* - * Likewise, bound things on the other side by preventing insane quota - * periods. This also allows us to normalize in computing quota - * feasibility. - */ - if (period > max_cfs_quota_period) - return -EINVAL; - - /* - * Bound quota to defend quota against overflow during bandwidth shift. - */ - if (quota != RUNTIME_INF && quota > max_cfs_runtime) - return -EINVAL; + if (quota_us == RUNTIME_INF) + quota = RUNTIME_INF; + else + quota = (u64)quota_us * NSEC_PER_USEC; - if (quota != RUNTIME_INF && (burst > quota || - burst + quota > max_cfs_runtime)) - return -EINVAL; + burst = (u64)burst_us * NSEC_PER_USEC; /* * Prevent race between setting of cfs_rq->runtime_enabled and @@ -9517,28 +9650,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, return 0; } -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static u64 tg_get_cfs_period(struct task_group *tg) { - u64 quota, period, burst; + u64 cfs_period_us; - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); - return tg_set_cfs_bandwidth(tg, period, quota, burst); + return cfs_period_us; } -static long tg_get_cfs_quota(struct task_group *tg) +static u64 tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; if (tg->cfs_bandwidth.quota == RUNTIME_INF) - return -1; + return RUNTIME_INF; quota_us = tg->cfs_bandwidth.quota; do_div(quota_us, NSEC_PER_USEC); @@ -9546,45 +9673,7 @@ static long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; - burst = tg->cfs_bandwidth.burst; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_period(struct task_group *tg) -{ - u64 cfs_period_us; - - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); - do_div(cfs_period_us, NSEC_PER_USEC); - - return cfs_period_us; -} - -static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - burst = (u64)cfs_burst_us * NSEC_PER_USEC; - period = ktime_to_ns(tg->cfs_bandwidth.period); - quota = tg->cfs_bandwidth.quota; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_burst(struct task_group *tg) +static u64 tg_get_cfs_burst(struct task_group *tg) { u64 burst_us; @@ -9594,42 +9683,6 @@ static long tg_get_cfs_burst(struct task_group *tg) return burst_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_quota(css_tg(css)); -} - -static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, - struct cftype *cftype, s64 cfs_quota_us) -{ - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); -} - -static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_period(css_tg(css)); -} - -static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, - struct 
cftype *cftype, u64 cfs_period_us) -{ - return tg_set_cfs_period(css_tg(css), cfs_period_us); -} - -static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_burst(css_tg(css)); -} - -static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_burst_us) -{ - return tg_set_cfs_burst(css_tg(css), cfs_burst_us); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9762,6 +9815,126 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } + +const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ +static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_bw_runtime_us = MAX_BW; + +static void tg_bandwidth(struct task_group *tg, + u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) +{ + if (period_us_p) + *period_us_p = tg_get_cfs_period(tg); + if (quota_us_p) + *quota_us_p = tg_get_cfs_quota(tg); + if (burst_us_p) + *burst_us_p = tg_get_cfs_burst(tg); +} + +static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 period_us; + + tg_bandwidth(css_tg(css), &period_us, NULL, NULL); + return period_us; +} + +static int tg_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + const u64 max_usec = U64_MAX / NSEC_PER_USEC; + + if (tg == &root_task_group) + return -EINVAL; + + /* Values should survive translation to nsec */ + if (period_us > max_usec || + (quota_us != RUNTIME_INF && quota_us > max_usec) || + burst_us > max_usec) + return -EINVAL; + + /* + * Ensure we have some amount of bandwidth every period. This is to + * prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota_us < min_bw_quota_period_us || + period_us < min_bw_quota_period_us) + return -EINVAL; + + /* + * Likewise, bound things on the other side by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period_us > max_bw_quota_period_us) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. 
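tg_set_bandwidth() now takes period/quota/burst in microseconds and performs the sanity checks shown in this hunk (continuing just below) before converting to nanoseconds. A userspace restatement of those checks follows; MAX_BW_RUNTIME_US assumes BW_SHIFT == 20, matching the "more than 203 days" comment, and bandwidth_valid() is a hypothetical helper, not a kernel function.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF       ((uint64_t)~0ULL)
#define NSEC_PER_USEC     1000ULL
#define MIN_PERIOD_US     1000ULL          /* 1ms */
#define MAX_PERIOD_US     1000000ULL       /* 1s  */
/* assumes BW_SHIFT == 20: (1 << (64 - 20)) - 1 microseconds, ~203 days */
#define MAX_BW_RUNTIME_US ((((uint64_t)1) << 44) - 1)

/* Userspace restatement of the microsecond-based bandwidth sanity checks. */
static bool bandwidth_valid(uint64_t period_us, uint64_t quota_us, uint64_t burst_us)
{
	const uint64_t max_usec = UINT64_MAX / NSEC_PER_USEC;

	/* values must survive the later *_us -> nanoseconds conversion */
	if (period_us > max_usec ||
	    (quota_us != RUNTIME_INF && quota_us > max_usec) ||
	    burst_us > max_usec)
		return false;

	/* some bandwidth every period, and a period that is neither tiny nor huge */
	if (quota_us < MIN_PERIOD_US || period_us < MIN_PERIOD_US)
		return false;
	if (period_us > MAX_PERIOD_US)
		return false;

	/* keep quota and burst below the bandwidth-shift overflow point */
	if (quota_us != RUNTIME_INF && quota_us > MAX_BW_RUNTIME_US)
		return false;
	if (quota_us != RUNTIME_INF &&
	    (burst_us > quota_us || burst_us + quota_us > MAX_BW_RUNTIME_US))
		return false;

	return true;
}

int main(void)
{
	printf("%d\n", bandwidth_valid(100000, 50000, 0));        /* 1: 50ms every 100ms */
	printf("%d\n", bandwidth_valid(100000, 500, 0));          /* 0: quota below 1ms */
	printf("%d\n", bandwidth_valid(2000000, RUNTIME_INF, 0)); /* 0: period above 1s */
	return 0;
}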
+ */ + if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us) + return -EINVAL; + + if (quota_us != RUNTIME_INF && (burst_us > quota_us || + burst_us + quota_us > max_bw_runtime_us)) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +} + +static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 quota_us; + + tg_bandwidth(css_tg(css), NULL, "a_us, NULL); + return quota_us; /* (s64)RUNTIME_INF becomes -1 */ +} + +static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 burst_us; + + tg_bandwidth(css_tg(css), NULL, NULL, &burst_us); + return burst_us; +} + +static int cpu_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period_us) +{ + struct task_group *tg = css_tg(css); + u64 quota_us, burst_us; + + tg_bandwidth(tg, NULL, "a_us, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 quota_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, burst_us; + + if (quota_us < 0) + quota_us = RUNTIME_INF; + + tg_bandwidth(tg, &period_us, NULL, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 burst_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, quota_us; + + tg_bandwidth(tg, &period_us, "a_us, NULL); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} #endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9807,7 +9980,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, scx_group_set_idle(css_tg(css), idle); return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -9824,19 +9997,19 @@ static struct cftype cpu_legacy_files[] = { #endif #ifdef CONFIG_CFS_BANDWIDTH { - .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, + .name = "cfs_period_us", + .read_u64 = cpu_period_read_u64, + .write_u64 = cpu_period_write_u64, }, { - .name = "cfs_period_us", - .read_u64 = cpu_cfs_period_read_u64, - .write_u64 = cpu_cfs_period_write_u64, + .name = "cfs_quota_us", + .read_s64 = cpu_quota_read_s64, + .write_s64 = cpu_quota_write_s64, }, { .name = "cfs_burst_us", - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, { .name = "stat", @@ -9935,7 +10108,7 @@ static int cpu_extra_stat_show(struct seq_file *sf, cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec, cfs_b->nr_burst, burst_usec); } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ return 0; } @@ -10033,22 +10206,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, } /* caller should put the current value in *@periodp before calling */ -static int __maybe_unused cpu_period_quota_parse(char *buf, - u64 *periodp, u64 *quotap) +static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, + u64 *quota_us_p) { char tok[21]; /* U64_MAX */ - if (sscanf(buf, "%20s %llu", tok, periodp) < 1) + if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1) return -EINVAL; - *periodp *= NSEC_PER_USEC; - - if (sscanf(tok, "%llu", quotap)) - *quotap *= NSEC_PER_USEC; - else if (!strcmp(tok, "max")) - *quotap = RUNTIME_INF; - else - return -EINVAL; + if (sscanf(tok, "%llu", 
quota_us_p) < 1) { + if (!strcmp(tok, "max")) + *quota_us_p = RUNTIME_INF; + else + return -EINVAL; + } return 0; } @@ -10057,8 +10228,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); + u64 period_us, quota_us; - cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + tg_bandwidth(tg, &period_us, "a_us, NULL); + cpu_period_quota_print(sf, period_us, quota_us); return 0; } @@ -10066,17 +10239,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct task_group *tg = css_tg(of_css(of)); - u64 period = tg_get_cfs_period(tg); - u64 burst = tg->cfs_bandwidth.burst; - u64 quota; + u64 period_us, quota_us, burst_us; int ret; - ret = cpu_period_quota_parse(buf, &period, "a); + tg_bandwidth(tg, &period_us, NULL, &burst_us); + ret = cpu_period_quota_parse(buf, &period_us, "a_us); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota, burst); + ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us); return ret ?: nbytes; } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ static struct cftype cpu_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -10109,10 +10281,10 @@ static struct cftype cpu_files[] = { { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -10126,7 +10298,7 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, -#endif +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* terminate */ }; @@ -10147,7 +10319,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .threaded = true, }; -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ void dump_cpu_task(int cpu) { @@ -10733,7 +10905,7 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!t->mm || t->mm_cid != -1); t->mm_cid_active = 1; } -#endif +#endif /* CONFIG_SCHED_MM_CID */ #ifdef CONFIG_SCHED_CLASS_EXT void sched_deq_and_put_task(struct task_struct *p, int queue_flags, @@ -10768,4 +10940,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) if (ctx->running) set_next_task(rq, ctx->p); } -#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c4606ca89210..9ede71ecba7f 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -4,6 +4,8 @@ * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. */ +#include "sched.h" + struct sched_core_cookie { refcount_t refcnt; }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda09949..23a56ba12d81 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,8 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include <linux/sched/cputime.h> +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... 
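cpu_max_write() above seeds the current period and burst, then parses the cgroup2 cpu.max payload, where the quota token may be "max" and the period is optional. A small userspace model of that parsing is sketched here, keeping both values in microseconds as the reworked interface does; parse_cpu_max() is an illustrative name, not the kernel helper.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RUNTIME_INF ((uint64_t)~0ULL)

/*
 * cpu.max payload is "<quota> [<period>]", both in microseconds, with "max"
 * meaning unlimited quota.  A missing period keeps the caller-provided value.
 */
static int parse_cpu_max(const char *buf, uint64_t *period_us, uint64_t *quota_us)
{
	char tok[21];                   /* large enough for U64_MAX */
	unsigned long long p, q;
	int n;

	n = sscanf(buf, "%20s %llu", tok, &p);
	if (n < 1)
		return -1;
	if (n == 2)
		*period_us = p;

	if (sscanf(tok, "%llu", &q) == 1)
		*quota_us = q;
	else if (!strcmp(tok, "max"))
		*quota_us = RUNTIME_INF;
	else
		return -1;
	return 0;
}

int main(void)
{
	uint64_t period = 100000, quota = 0;    /* period seeded with the current value */

	if (!parse_cpu_max("max", &period, &quota))
		printf("quota=%s period=%llu\n",
		       quota == RUNTIME_INF ? "max" : "finite",
		       (unsigned long long)period);

	if (!parse_cpu_max("50000 200000", &period, &quota))
		printf("quota=%llu period=%llu\n",
		       (unsigned long long)quota, (unsigned long long)period);
	return 0;
}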
*/ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 95baa12a1029..cdd740b3f774 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -6,6 +6,7 @@ * * Author: Juri Lelli <j.lelli@sssup.it> */ +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 0adeda93b5fb..11c0f1faa7e1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> +#include <linux/spinlock.h> #define IDX_INVALID -1 @@ -15,7 +17,6 @@ struct cpudl { struct cpudl_item *elements; }; -#ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); @@ -23,4 +24,3 @@ int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5252fb191fae..742fb9e62e1a 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,7 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 461242ec958a..0ab5f9d4bc59 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include <uapi/linux/sched/types.h> +#include "sched.h" #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) @@ -380,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 42c40cfdf836..76a9ac5eb794 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,6 +22,7 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. 
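cpudeadline.c, whose parent() helper appears in the hunk above, keeps per-root-domain DL state in an array-backed max-heap so the CPU running the latest-deadline task is found at the root in O(1). A toy sketch of that idea is given below, insertion and sift-up only; the real cpudl also handles updates, removal and a free-CPU mask.

#include <stdint.h>
#include <stdio.h>

/* One heap slot: a CPU and the deadline of the DL task it is running. */
struct cpudl_item { int cpu; uint64_t dl; };

static struct cpudl_item heap[8];
static int heap_size;

static inline int parent(int i) { return (i - 1) >> 1; }

/* Insert and sift up so the CPU with the *latest* deadline sits at the root. */
static void cpudl_insert(int cpu, uint64_t dl)
{
	int i = heap_size++;

	heap[i] = (struct cpudl_item){ cpu, dl };
	while (i > 0 && heap[parent(i)].dl < heap[i].dl) {
		struct cpudl_item tmp = heap[i];

		heap[i] = heap[parent(i)];
		heap[parent(i)] = tmp;
		i = parent(i);
	}
}

int main(void)
{
	cpudl_insert(0, 300);
	cpudl_insert(1, 500);
	cpudl_insert(2, 100);
	/* The root answers "which CPU is running the least urgent DL task?" */
	printf("push candidate: cpu%d (deadline %llu)\n",
	       heap[0].cpu, (unsigned long long)heap[0].dl);
	return 0;
}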
*/ +#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index d6cba0020064..6f562088c056 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/atomic.h> +#include <linux/cpumask.h> +#include <linux/sched/rt.h> #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) @@ -17,7 +20,6 @@ struct cpupri { int *cpu_to_pri; }; -#ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, @@ -26,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6dab4854c6c0..7097de2c8cda 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,6 +2,9 @@ /* * Simple CPU accounting cgroup controller */ +#include <linux/sched/cputime.h> +#include <linux/tsacct_kern.h> +#include "sched.h" #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include <asm/cputime.h> @@ -88,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime) return delta; } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static u64 irqtime_tick_accounted(u64 dummy) { @@ -241,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } -#endif +#endif /* CONFIG_SCHED_CORE */ /* * When a guest is interrupted for a longer amount of time, missed clock @@ -262,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) return steal; } -#endif +#endif /* CONFIG_PARAVIRT */ return 0; } @@ -288,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t) { return t->se.sum_exec_runtime; } -#else +#else /* !CONFIG_64BIT: */ static u64 read_sum_exec_runtime(struct task_struct *t) { u64 ns; @@ -301,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) return ns; } -#endif +#endif /* !CONFIG_64BIT */ /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live @@ -411,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks) { irqtime_account_process_tick(current, 0, ticks); } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, int nr_ticks) { } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 89019a140826..e2d51f4306b3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,10 @@ */ #include <linux/cpuset.h> +#include <linux/sched/clock.h> +#include <uapi/linux/sched/types.h> +#include "sched.h" +#include "pelt.h" /* * Default limits for DL period; on the top end we guard against small util @@ -51,7 +55,7 @@ static int __init sched_dl_sysctl_init(void) return 0; } late_initcall(sched_dl_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static bool dl_server(struct sched_dl_entity *dl_se) { @@ -99,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return pi_of(dl_se) != dl_se; } -#else +#else /* !CONFIG_RT_MUTEXES: */ static inline struct 
sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) { return dl_se; @@ -109,9 +113,8 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return false; } -#endif +#endif /* !CONFIG_RT_MUTEXES */ -#ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -191,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else -static inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - return 1; -} - -static inline unsigned long dl_bw_capacity(int i) -{ - return SCHED_CAPACITY_SCALE; -} - -bool dl_bw_visited(int cpu, u64 cookie) -{ - return false; -} - -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -552,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->root = RB_ROOT_CACHED; -#ifdef CONFIG_SMP /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; -#else - init_dl_bw(&dl_rq->dl_bw); -#endif dl_rq->running_bw = 0; dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } -#ifdef CONFIG_SMP - static inline int dl_overloaded(struct rq *rq) { return atomic_read(&rq->rd->dlo_count); @@ -753,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else - -static inline -void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline -void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline void deadline_queue_push_tasks(struct rq *rq) -{ -} - -static inline void deadline_queue_pull_task(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -824,6 +761,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + update_rq_clock(rq); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); @@ -1195,7 +1134,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) static void __push_dl_task(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. 
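The dl_bw helpers above (dl_bw_of(), __dl_update(), __dl_sub()) maintain a per-root-domain pool of admitted deadline bandwidth in fixed point. Below is a heavily simplified admission sketch that ignores capacity asymmetry and extra_bw redistribution; dl_bw_pool and dl_admit() are invented names used only for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Toy per-root-domain bandwidth pool: bw is the per-CPU utilization limit in
 * fixed point (BW_SHIFT == 20 assumed), total_bw is the sum over all admitted
 * DL entities.
 */
struct dl_bw_pool {
	uint64_t bw;
	uint64_t total_bw;
	int cpus;
};

static bool dl_admit(struct dl_bw_pool *pool, uint64_t old_bw, uint64_t new_bw)
{
	/* reject changes that would exceed cpus * per-CPU limit */
	if (pool->total_bw - old_bw + new_bw > pool->bw * pool->cpus)
		return false;

	pool->total_bw = pool->total_bw - old_bw + new_bw;
	return true;
}

int main(void)
{
	struct dl_bw_pool pool = { .bw = (95 << 20) / 100, .cpus = 2 };   /* 95% per CPU */

	printf("50%% task: %s\n", dl_admit(&pool, 0, 1 << 19) ? "admitted" : "rejected");
	printf("80%% task: %s\n", dl_admit(&pool, 0, (8 << 20) / 10) ? "admitted" : "rejected");
	printf("80%% task: %s\n", dl_admit(&pool, 0, (8 << 20) / 10) ? "admitted" : "rejected");
	return 0;
}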
@@ -1209,12 +1147,13 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; +static bool dl_server_stopped(struct sched_dl_entity *dl_se); + static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1234,6 +1173,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->server_has_tasks(dl_se)) { replenish_dl_entity(dl_se); + dl_server_stopped(dl_se); return HRTIMER_NORESTART; } @@ -1339,7 +1279,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } -#ifdef CONFIG_SMP if (unlikely(!rq->online)) { /* * If the runqueue is no longer available, migrate the @@ -1356,7 +1295,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. */ } -#endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1600,7 +1538,7 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } /* @@ -1639,31 +1577,17 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_runtime) { + dl_se->dl_server_idle = 0; update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } } void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; - /* - * XXX: the apply do not work fine at the init phase for the - * fair server because things are not yet set. We need to improve - * this before getting generic. 
- */ - if (!dl_server(dl_se)) { - u64 runtime = 50 * NSEC_PER_MSEC; - u64 period = 1000 * NSEC_PER_MSEC; - - dl_server_apply_params(dl_se, runtime, period, 1); - - dl_se->dl_server = 1; - dl_se->dl_defer = 1; - setup_new_dl_entity(dl_se); - } - - if (!dl_se->dl_runtime) + if (!dl_server(dl_se) || dl_se->dl_server_active) return; dl_se->dl_server_active = 1; @@ -1674,7 +1598,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) void dl_server_stop(struct sched_dl_entity *dl_se) { - if (!dl_se->dl_runtime) + if (!dl_server(dl_se) || !dl_server_active(dl_se)) return; dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); @@ -1684,6 +1608,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } +static bool dl_server_stopped(struct sched_dl_entity *dl_se) +{ + if (!dl_se->dl_server_active) + return false; + + if (dl_se->dl_server_idle) { + dl_server_stop(dl_se); + return true; + } + + dl_se->dl_server_idle = 1; + return false; +} + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) @@ -1693,6 +1631,32 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_se->server_pick_task = pick_task; } +void sched_init_dl_servers(void) +{ + int cpu; + struct rq *rq; + struct sched_dl_entity *dl_se; + + for_each_online_cpu(cpu) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + rq = cpu_rq(cpu); + + guard(rq_lock_irq)(rq); + + dl_se = &rq->fair_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } +} + void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) { u64 new_bw = dl_se->dl_bw; @@ -1844,8 +1808,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) #define __node_2_dle(node) \ rb_entry((node), struct sched_dl_entity, rb_node) -#ifdef CONFIG_SMP - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); @@ -1881,13 +1843,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else - -static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} - -#endif /* CONFIG_SMP */ - static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -2166,6 +2121,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (dl_server(&p->dl)) return; + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2214,8 +2172,6 @@ static void yield_task_dl(struct rq *rq) rq_clock_skip_update(rq); } -#ifdef CONFIG_SMP - static inline bool dl_task_is_earliest_deadline(struct task_struct *p, struct rq *rq) { @@ -2345,7 +2301,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Only called when both the current and waking task are -deadline @@ -2359,7 +2314,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, return; } -#ifdef CONFIG_SMP /* * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... 
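dl_server_stopped() above implements a lazy shutdown for the fair server: dl_server_update() clears an idle flag whenever fair tasks actually consume server runtime, and the server only stops itself once it sees the flag still set on a second consecutive check. A compact model of that two-observation pattern follows, with invented server_update()/server_check_idle() names.

#include <stdbool.h>
#include <stdio.h>

struct dl_server {
	bool active;
	bool idle;      /* "no fair task has used me since the last check" */
};

/* Called when the server actually hands CPU time to a fair task. */
static void server_update(struct dl_server *s)
{
	s->idle = false;
}

/* Called periodically when the server finds nothing to run: stop only
 * after two consecutive idle observations. */
static bool server_check_idle(struct dl_server *s)
{
	if (!s->active)
		return false;
	if (s->idle) {
		s->active = false;      /* dl_server_stop() */
		return true;
	}
	s->idle = true;
	return false;
}

int main(void)
{
	struct dl_server srv = { .active = true };

	server_update(&srv);                            /* fair tasks consumed runtime */
	printf("stop? %d\n", server_check_idle(&srv));  /* 0: first idle observation */
	printf("stop? %d\n", server_check_idle(&srv));  /* 1: still idle, server stops */
	printf("stop? %d\n", server_check_idle(&srv));  /* 0: already inactive */
	return 0;
}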
@@ -2367,7 +2321,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); -#endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK @@ -2375,11 +2328,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { hrtick_start(rq, dl_se->runtime); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { @@ -2435,7 +2388,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (dl_server_active(dl_se)) { + if (!dl_server_stopped(dl_se)) { dl_se->dl_yielded = 1; update_curr_dl_se(rq, dl_se, 0); } @@ -2465,6 +2418,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); + + if (task_is_blocked(p)) + return; + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2500,8 +2457,6 @@ static void task_fork_dl(struct task_struct *p) */ } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define DL_MAX_TRIES 3 @@ -2976,7 +2931,14 @@ void dl_clear_root_domain(struct root_domain *rd) int i; guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); + + /* + * Reset total_bw to zero and extra_bw to max_bw so that next + * loop will add dl-servers contributions back properly, + */ rd->dl_bw.total_bw = 0; + for_each_cpu(i, rd->span) + cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw; /* * dl_servers are not tasks. Since dl_add_task_root_domain ignores @@ -2995,8 +2957,6 @@ void dl_clear_root_domain_cpu(int cpu) dl_clear_root_domain(cpu_rq(cpu)->rd); } -#endif /* CONFIG_SMP */ - static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* @@ -3069,10 +3029,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } if (rq->donor != p) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); -#endif if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else @@ -3092,7 +3050,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!task_on_rq_queued(p)) return; -#ifdef CONFIG_SMP /* * This might be too much, but unfortunately * we don't have the old deadline value, and @@ -3121,13 +3078,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else - /* - * We don't know if p has a earlier or later deadline, so let's blindly - * set a (maybe not needed) rescheduling point. 
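sched_dl_do_global() above recomputes the per-CPU limit from the global RT sysctls as a fixed-point ratio (runtime << BW_SHIFT / period). The short illustration below works that arithmetic for the default 950000us/1000000us settings; the period == 0 handling here is a simplification of the kernel's to_ratio(), and the RUNTIME_INF case is handled by the caller in the real code.

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20
#define BW_UNIT  (1ULL << BW_SHIFT)

/* Simplified take on to_ratio(): utilization = runtime/period in fixed point. */
static uint64_t ratio(uint64_t period_us, uint64_t runtime_us)
{
	if (period_us == 0)
		return BW_UNIT;                 /* treat as "a whole CPU" */
	return (runtime_us << BW_SHIFT) / period_us;
}

int main(void)
{
	/* default sysctls: sched_rt_runtime_us = 950000, sched_rt_period_us = 1000000 */
	uint64_t bw = ratio(1000000, 950000);

	printf("bw = %llu/%llu (~%.2f of a CPU)\n",
	       (unsigned long long)bw, (unsigned long long)BW_UNIT,
	       (double)bw / (double)BW_UNIT);
	return 0;
}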
- */ - resched_curr(rq); -#endif } #ifdef CONFIG_SCHED_CORE @@ -3149,7 +3099,6 @@ DEFINE_SCHED_CLASS(dl) = { .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, -#ifdef CONFIG_SMP .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, @@ -3158,7 +3107,6 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif .task_tick = task_tick_dl, .task_fork = task_fork_dl, @@ -3242,6 +3190,9 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + for_each_possible_cpu(cpu) + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + for_each_possible_cpu(cpu) { rcu_read_lock_sched(); @@ -3257,7 +3208,6 @@ void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } @@ -3458,7 +3408,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) return false; } -#ifdef CONFIG_SMP int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3570,7 +3519,6 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif void print_dl_stats(struct seq_file *m, int cpu) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 557246880a7e..3f06ab84d53f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,6 +6,9 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ +#include <linux/debugfs.h> +#include <linux/nmi.h> +#include "sched.h" /* * This allows printing both to /sys/kernel/debug/sched/debug and @@ -90,10 +93,10 @@ static void sched_feat_enable(int i) { static_key_enable_cpuslocked(&sched_feat_keys[i]); } -#else +#else /* !CONFIG_JUMP_LABEL: */ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { @@ -166,8 +169,6 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -#ifdef CONFIG_SMP - static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,8 +215,6 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* SMP */ - #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -283,7 +282,6 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_verbose; -#ifdef CONFIG_SMP static struct dentry *sd_dentry; @@ -311,9 +309,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else -#define sched_verbose_write debugfs_write_file_bool -#endif static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -512,7 +507,6 @@ static __init int sched_init_debug(void) debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -#ifdef CONFIG_SMP debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -520,7 +514,6 @@ static __init int sched_init_debug(void) 
sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -530,7 +523,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); -#endif +#endif /* CONFIG_NUMA_BALANCING */ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); @@ -540,8 +533,6 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); -#ifdef CONFIG_SMP - static cpumask_var_t sd_sysctl_cpus; static int sd_flags_show(struct seq_file *m, void *v) @@ -652,8 +643,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -690,18 +679,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); -#ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif #undef PN_SCHEDSTAT #undef PN #undef P_SCHEDSTAT #undef P } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED static DEFINE_SPINLOCK(sched_debug_lock); @@ -854,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", @@ -874,8 +860,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -929,11 +914,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) PU(dl_nr_running); -#ifdef CONFIG_SMP dl_bw = &cpu_rq(cpu)->rd->dl_bw; -#else - dl_bw = &dl_rq->dl_bw; -#endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); @@ -951,9 +932,9 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } -#else +#else /* !CONFIG_X86: */ SEQ_printf(m, "cpu#%d\n", cpu); -#endif +#endif /* !CONFIG_X86 */ #define P(x) \ do { \ @@ -976,12 +957,10 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SMP #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1163,7 +1142,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); -#endif +#endif /* CONFIG_NUMA_BALANCING */ } void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, @@ -1247,7 +1226,6 @@ void 
proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); -#ifdef CONFIG_SMP P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); @@ -1256,13 +1234,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); -#endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); -#endif +#endif /* CONFIG_UCLAMP_TASK */ P(policy); P(prio); if (task_has_dl_policy(p)) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a14da5396fb..b173a059315c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str) } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); -#ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. */ @@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif #ifdef CONFIG_CFS_BANDWIDTH /* @@ -162,7 +160,7 @@ static int __init sched_fair_sysctl_init(void) return 0; } late_initcall(sched_fair_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -471,7 +469,7 @@ static int se_is_idle(struct sched_entity *se) return cfs_rq_is_idle(group_cfs_rq(se)); } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -517,7 +515,7 @@ static int se_is_idle(struct sched_entity *se) return task_has_idle_policy(task_of(se)); } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); @@ -884,23 +882,44 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. + * Set the vruntime up to which an entity can run before looking + * for another entity to pick. + * In case of run to parity, we use the shortest slice of the enqueued + * entities to set the protected period. + * When run to parity is disabled, we give a minimum quantum to the running + * entity to ensure progress. 
*/ -static inline void set_protect_slice(struct sched_entity *se) +static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->vlag = se->deadline; + u64 slice = normalized_sysctl_sched_base_slice; + u64 vprot = se->deadline; + + if (sched_feat(RUN_TO_PARITY)) + slice = cfs_rq_min_slice(cfs_rq); + + slice = min(slice, se->slice); + if (slice != se->slice) + vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); + + se->vprot = vprot; +} + +static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = cfs_rq_min_slice(cfs_rq); + + se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se)); } static inline bool protect_slice(struct sched_entity *se) { - return se->vlag == se->deadline; + return ((s64)(se->vprot - se->vruntime) > 0); } static inline void cancel_protect_slice(struct sched_entity *se) { if (protect_slice(se)) - se->vlag = se->deadline + 1; + se->vprot = se->vruntime; } /* @@ -922,7 +941,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. */ -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -939,7 +958,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) + if (curr && protect && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -983,6 +1002,11 @@ found: return best; } +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +{ + return __pick_eevdf(cfs_rq, true); +} + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -996,7 +1020,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1008,7 +1031,6 @@ int sched_update_scaling(void) return 0; } -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1041,7 +1063,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) } #include "pelt.h" -#ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -1131,34 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP */ -void init_entity_runnable_average(struct sched_entity *se) -{ -} -void post_init_entity_util_avg(struct task_struct *p) -{ -} -static void update_tg_load_avg(struct cfs_rq *cfs_rq) -{ -} -#endif /* CONFIG_SMP */ - -static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); s64 delta_exec; - delta_exec = now - curr->exec_start; + delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec; - curr->exec_start = now; - curr->sum_exec_runtime += delta_exec; + se->exec_start = now; + if (entity_is_task(se)) { + struct task_struct *donor = task_of(se); + 
struct task_struct *running = rq->curr; + /* + * If se is a task, we account the time against the running + * task, as w/ proxy-exec they may not be the same. + */ + running->se.exec_start = now; + running->se.sum_exec_runtime += delta_exec; + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); + } else { + /* If not task, account the time against donor se */ + se->sum_exec_runtime += delta_exec; + } if (schedstat_enabled()) { struct sched_statistics *stats; - stats = __schedstats_from_se(curr); + stats = __schedstats_from_se(se); __schedstat_set(stats->exec_max, max(delta_exec, stats->exec_max)); } @@ -1166,58 +1193,12 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) return delta_exec; } -static inline void update_curr_task(struct task_struct *p, s64 delta_exec) -{ - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); -} - -static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (curr->vlag == curr->deadline) - return false; - - return !entity_eligible(cfs_rq, curr); -} - -static inline bool do_preempt_short(struct cfs_rq *cfs_rq, - struct sched_entity *pse, struct sched_entity *se) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (pse->slice >= se->slice) - return false; - - if (!entity_eligible(cfs_rq, pse)) - return false; - - if (entity_before(pse, se)) - return true; - - if (!entity_eligible(cfs_rq, se)) - return true; - - return false; -} - /* * Used by other classes to account runtime. */ s64 update_curr_common(struct rq *rq) { - struct task_struct *donor = rq->donor; - s64 delta_exec; - - delta_exec = update_curr_se(rq, &donor->se); - if (likely(delta_exec > 0)) - update_curr_task(donor, delta_exec); - - return delta_exec; + return update_se(rq, &rq->donor->se); } /* @@ -1225,6 +1206,12 @@ s64 update_curr_common(struct rq *rq) */ static void update_curr(struct cfs_rq *cfs_rq) { + /* + * Note: cfs_rq->curr corresponds to the task picked to + * run (ie: rq->donor.se) which due to proxy-exec may + * not necessarily be the actual task running + * (rq->curr.se). This is easy to confuse! 
+ */ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq); s64 delta_exec; @@ -1233,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq, curr); + delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return; @@ -1242,10 +1229,6 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); - - update_curr_task(p, delta_exec); - /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -1265,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return; - if (resched || did_preempt_short(cfs_rq, curr)) { + if (resched || !protect_slice(curr)) { resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } @@ -2114,12 +2097,12 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline int numa_idle_core(int idle_core, int cpu) { return idle_core; } -#endif +#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -3673,7 +3656,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) p->numa_scan_period = task_scan_start(p); } -#else +#else /* !CONFIG_NUMA_BALANCING: */ + static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } @@ -3690,20 +3674,18 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) { } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); account_numa_enqueue(rq, task_of(se)); list_add(&se->group_node, &rq->cfs_tasks); } -#endif cfs_rq->nr_queued++; } @@ -3711,12 +3693,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } -#endif cfs_rq->nr_queued--; } @@ -3768,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) -#ifdef CONFIG_SMP static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3785,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else -static inline void -enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -3822,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_load_set(&se->load, weight); -#ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); -#endif enqueue_load_avg(cfs_rq, se); if (se->on_rq) { @@ -3863,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP /* * 
All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. @@ -3970,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } -#endif /* CONFIG_SMP */ /* * Recomputes the group entity based on the current state of its group @@ -3991,20 +3960,16 @@ static void update_cfs_group(struct sched_entity *se) if (throttled_hierarchy(gcfs_rq)) return; -#ifndef CONFIG_SMP - shares = READ_ONCE(gcfs_rq->tg->shares); -#else shares = calc_group_shares(gcfs_rq); -#endif if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_cfs_group(struct sched_entity *se) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { @@ -4029,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } -#ifdef CONFIG_SMP static inline bool load_avg_is_decayed(struct sched_avg *sa) { if (sa->load_sum) @@ -4481,7 +4445,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) return true; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} @@ -4494,7 +4458,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_NO_HZ_COMMON static inline void migrate_se_pelt_lag(struct sched_entity *se) @@ -4575,9 +4539,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) __update_load_avg_blocked_se(now, se); } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static void migrate_se_pelt_lag(struct sched_entity *se) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages @@ -5144,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* CONFIG_SMP */ - -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - return !cfs_rq->nr_queued; -} - -#define UPDATE_TG 0x0 -#define SKIP_AGE_LOAD 0x0 -#define DO_ATTACH 0x0 -#define DO_DETACH 0x0 - -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) -{ - cfs_rq_util_change(cfs_rq, 0); -} - -static inline void remove_entity_load_avg(struct sched_entity *se) {} - -static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} - -static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) -{ - return 0; -} - -static inline void -util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} -static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - -#endif /* CONFIG_SMP */ - void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { struct sched_entity *se = &p->se; @@ -5253,7 +5175,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags) * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) - * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: @@ -5550,7 +5472,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(se); + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -5685,7 +5607,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ static bool cfs_bandwidth_used(void) { return true; @@ -5693,16 +5615,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* CONFIG_JUMP_LABEL */ - -/* - * default period for cfs group bandwidth. - * default: 0.1s, units: nanoseconds - */ -static inline u64 default_cfs_period(void) -{ - return 100000000ULL; -} +#endif /* !CONFIG_JUMP_LABEL */ static inline u64 sched_cfs_bandwidth_slice(void) { @@ -5889,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long queued_delta, runnable_delta, idle_delta, dequeue = 1; - long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5973,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, queued_delta); - - /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the @@ -6088,7 +5996,6 @@ unthrottle_throttle: resched_curr(rq); } -#ifdef CONFIG_SMP static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; @@ -6147,12 +6054,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else -static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -{ - unthrottle_cfs_rq(cfs_rq); -} -#endif static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6490,8 +6391,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } -extern const u64 max_cfs_quota_period; - static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -6518,7 +6417,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) * to fail. */ new = old * 2; - if (new < max_cfs_quota_period) { + if (new < max_bw_quota_period_us * NSEC_PER_USEC) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; cfs_b->burst *= 2; @@ -6552,7 +6451,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->period = us_to_ktime(default_bw_period_us()); cfs_b->burst = 0; cfs_b->hierarchical_quota = parent ? 
parent->hierarchical_quota : RUNTIME_INF; @@ -6608,7 +6507,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * guaranteed at this point that no additional cfs_rq of this group can * join a CSD list. */ -#ifdef CONFIG_SMP for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); unsigned long flags; @@ -6620,7 +6518,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) __cfsb_csd_unthrottle(rq); local_irq_restore(flags); } -#endif } /* @@ -6733,9 +6630,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) if (cfs_task_bw_constrained(p)) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#endif +#endif /* CONFIG_NO_HZ_FULL */ -#else /* CONFIG_CFS_BANDWIDTH */ +#else /* !CONFIG_CFS_BANDWIDTH: */ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } @@ -6777,7 +6674,7 @@ bool cfs_task_bw_constrained(struct task_struct *p) return false; } #endif -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* !CONFIG_CFS_BANDWIDTH */ #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} @@ -6822,7 +6719,7 @@ static void hrtick_update(struct rq *rq) hrtick_start_fair(rq, donor); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -6831,9 +6728,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static inline void hrtick_update(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { unsigned long rq_util_min, rq_util_max; @@ -6875,9 +6771,6 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else -static inline void check_update_overutilized_status(struct rq *rq) { } -#endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -6886,12 +6779,10 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } -#endif static void requeue_delayed_entity(struct sched_entity *se) @@ -7070,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; @@ -7154,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); - /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; @@ -7206,8 +7093,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq) return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); } -#ifdef CONFIG_SMP - /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). 
*/ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); @@ -7677,7 +7562,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) { @@ -7698,7 +7583,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd return -1; } -#endif /* CONFIG_SCHED_SMT */ +#endif /* !CONFIG_SCHED_SMT */ /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by @@ -8743,9 +8628,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } -#else -static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { @@ -8767,6 +8649,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8815,7 +8698,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - cancel_protect_slice(se); + do_preempt_short = true; goto preempt; } @@ -8833,22 +8716,24 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. - * - * Note that even if @p does not turn out to be the most eligible - * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se)) - cancel_protect_slice(se); + do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); /* * If @p has become the most eligible task, force preemption. */ - if (pick_eevdf(cfs_rq) == pse) + if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; + if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + update_protect_slice(cfs_rq, se); + return; preempt: + if (do_preempt_short) + cancel_protect_slice(se); + resched_curr_lazy(rq); } @@ -8939,7 +8824,7 @@ again: return p; simple: -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ put_prev_set_next_task(rq, prev, p); return p; @@ -9055,7 +8940,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) return true; } -#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods. * @@ -9357,13 +9241,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return src_weight - dst_weight; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { return 0; } -#endif +#endif /* !CONFIG_NUMA_BALANCING */ /* * Check whether the task is ineligible on the destination cpu @@ -9407,7 +9291,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) throttled_lb_pair, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or - * 5) are cache-hot on their current CPU. 
+ * 5) are cache-hot on their current CPU, or + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; @@ -9429,6 +9314,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; + if (task_is_blocked(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; @@ -9464,7 +9352,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_on_cpu(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p) || + task_current_donor(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -9508,6 +9397,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p->stats.nr_forced_migrations); } + WARN_ON(task_current(env->src_rq, p)); + WARN_ON(task_current_donor(env->src_rq, p)); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9772,12 +9664,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) if (!has_blocked) rq->has_blocked_load = 0; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -9886,7 +9778,7 @@ static unsigned long task_h_load(struct task_struct *p) return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -9903,7 +9795,7 @@ static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg; } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void sched_balance_update_blocked_averages(int cpu) { @@ -10048,9 +9940,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) min_capacity = ULONG_MAX; max_capacity = 0; - if (child->flags & SD_OVERLAP) { + if (child->flags & SD_NUMA) { /* - * SD_OVERLAP domains cannot assume that child groups + * SD_NUMA domains cannot assume that child groups * span the current group. */ @@ -10063,7 +9955,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } } else { /* - * !SD_OVERLAP domains can assume that child groups + * !SD_NUMA domains can assume that child groups * span the current group. */ @@ -10616,7 +10508,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) return remote; return all; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { return all; @@ -10626,7 +10518,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) { return regular; } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ struct sg_lb_stats; @@ -12174,8 +12066,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. + * + * sched_balance_newidle() bumps the cost whenever newidle + * balance fails, and we don't want things to grow out of + * control. 
Use the sysctl_sched_migration_cost as the upper + * limit, plus a little extra to avoid off-by-ones. */ - sd->max_newidle_lb_cost = cost; + sd->max_newidle_lb_cost = + min(cost, sysctl_sched_migration_cost + 200); sd->last_decay_max_lb_cost = jiffies; } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { /* @@ -12772,7 +12670,7 @@ static void nohz_newidle_balance(struct rq *this_rq) atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -12781,7 +12679,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } static inline void nohz_newidle_balance(struct rq *this_rq) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * sched_balance_newidle is called by schedule() if this_cpu is about to become @@ -12867,10 +12765,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Failing newidle means it is not effective; + * bump the cost so we end up doing less of it. + */ + if (!pulled_task) + domain_cost = (3 * sd->max_newidle_lb_cost) / 2; + + update_newidle_cost(sd, domain_cost); } /* @@ -12978,8 +12883,6 @@ static void rq_offline_fair(struct rq *rq) clear_tg_offline_cfs_rqs(rq); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_SCHED_CORE static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) @@ -13076,10 +12979,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, cfs_rqa = sea->cfs_rq; cfs_rqb = seb->cfs_rq; -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ cfs_rqa = &task_rq(a)->cfs; cfs_rqb = &task_rq(b)->cfs; -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ /* * Find delta after normalizing se's vruntime with its cfs_rq's @@ -13103,9 +13006,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } -#else +#else /* !CONFIG_SCHED_CORE: */ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} -#endif +#endif /* !CONFIG_SCHED_CORE */ /* * scheduler tick hitting a task of our scheduling class. @@ -13199,15 +13102,14 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) list_add_leaf_cfs_rq(cfs_rq); } } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_SMP /* * In case the task sched_avg hasn't been attached: * - A forked task which hasn't been woken up by wake_up_new_task(). 
@@ -13216,7 +13118,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) */ if (!se->avg.last_update_time) return; -#endif /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); @@ -13280,7 +13181,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs { struct sched_entity *se = &p->se; -#ifdef CONFIG_SMP if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -13288,7 +13188,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs */ list_move(&se->group_node, &rq->cfs_tasks); } -#endif if (!first) return; @@ -13326,9 +13225,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13343,10 +13240,8 @@ static void task_change_group_fair(struct task_struct *p) detach_task_cfs_rq(p); -#ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; -#endif set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } @@ -13642,7 +13537,6 @@ DEFINE_SCHED_CLASS(fair) = { .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, -#ifdef CONFIG_SMP .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13652,7 +13546,6 @@ DEFINE_SCHED_CLASS(fair) = { .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_fair, -#endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -13715,7 +13608,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { -#ifdef CONFIG_SMP int i; for_each_possible_cpu(i) { @@ -13737,6 +13629,4 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* SMP */ - } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2c85c86b455f..c39b089d4f09 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,6 +6,11 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ +#include <linux/cpuidle.h> +#include <linux/suspend.h> +#include <linux/livepatch.h> +#include "sched.h" +#include "smp.h" /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -47,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) return 1; } __setup("hlt", cpu_idle_nopoll_setup); -#endif +#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */ static noinline int __cpuidle cpu_idle_poll(void) { @@ -95,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void) if (static_branch_unlikely(&arch_needs_tick_broadcast)) tick_broadcast_exit(); } -#else +#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */ static inline void cond_tick_broadcast_enter(void) { } static inline void cond_tick_broadcast_exit(void) { } -#endif +#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */ /** * default_idle_call - Default CPU idle routine. @@ -427,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state) * idle-task scheduling class. 
*/ -#ifdef CONFIG_SMP static int select_task_rq_idle(struct task_struct *p, int cpu, int flags) { @@ -439,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif /* * Idle tasks are unconditionally rescheduled: @@ -526,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = { .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, -#ifdef CONFIG_SMP .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 93b038d48900..a4cf17b1fab0 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,6 +7,8 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ +#include <linux/sched/isolation.h> +#include "sched.h" enum hk_flags { HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 52ca8e268cfc..b601e0243d0e 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,6 +6,8 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ +#include <linux/sched/nohz.h> +#include "sched.h" /* * Global load-average calculations @@ -333,12 +335,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 809194cd779f..62fba83b7bb1 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,6 +4,8 @@ * * membarrier system call */ +#include <uapi/linux/membarrier.h> +#include "sched.h" /* * For documentation purposes, here are some membarrier ordering diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a8534a2deff..fa83bbaf4f3e 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -23,6 +23,7 @@ * Move PELT related code from fair.c into this pelt.c file * Author: Vincent Guittot <vincent.guittot@linaro.org> */ +#include "pelt.h" /* * Approximate: @@ -413,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) return 0; } -#endif +#endif /* CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* @@ -466,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } -#endif +#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */ /* * Load avg and utiliztion metrics need to be updated periodically and before diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index f4f6a0875c66..62c3fa543c0f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,4 +1,8 @@ -#ifdef CONFIG_SMP +// SPDX-License-Identifier: GPL-2.0 +#ifndef _KERNEL_SCHED_PELT_H +#define _KERNEL_SCHED_PELT_H +#include "sched.h" + #include "sched-pelt.h" int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); @@ -15,7 +19,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return READ_ONCE(rq->avg_hw.load_avg); } -#else +#else /* !CONFIG_SCHED_HW_PRESSURE: */ static inline int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { @@ -26,7 +30,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return 0; } -#endif +#endif /* !CONFIG_SCHED_HW_PRESSURE */ #ifdef 
CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); @@ -174,63 +178,12 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } -#else +#else /* !CONFIG_CFS_BANDWIDTH: */ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); } -#endif - -#else - -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int -update_rt_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_dl_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) -{ - return 0; -} - -static inline u64 hw_load_avg(struct rq *rq) -{ - return 0; -} - -static inline int -update_irq_load_avg(struct rq *rq, u64 running) -{ - return 0; -} - -static inline u64 rq_clock_pelt(struct rq *rq) -{ - return rq_clock_task(rq); -} - -static inline void -update_rq_clock_pelt(struct rq *rq, s64 delta) { } - -static inline void -update_idle_rq_clock_pelt(struct rq *rq) { } - -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif - +#endif /* !CONFIG_CFS_BANDWIDTH */ +#endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ad04a5c3162a..2024c1d36402 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,6 +136,10 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ +#include <linux/sched/clock.h> +#include <linux/workqueue.h> +#include <linux/psi.h> +#include "sched.h" static int psi_bug __read_mostly; @@ -172,6 +176,28 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +static DEFINE_PER_CPU(seqcount_t, psi_seq); + +static inline void psi_write_begin(int cpu) +{ + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); @@ -182,7 +208,7 @@ static void group_init(struct psi_group *group) group->enabled = true; for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + seqcount_init(per_cpu_ptr(&psi_seq, cpu)); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); @@ -262,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu, /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; if (cpu == current_cpu) memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); - } while (read_seqcount_retry(&groupc->seq, seq)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -768,31 +794,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) 
groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, unsigned int clear, unsigned int set, - bool wake_clock) + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; - u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we update the task counts according to the state - * change requested through the @clear and @set bits. - * - * Then if the cgroup PSI stats accounting enabled, we - * assess the aggregate resource states this CPU's tasks - * have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - */ - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in * the state mask. Clear, set, or carry the current state if @@ -843,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); return; } @@ -864,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); - if (state_mask & group->rtpoll_states) psi_schedule_rtpoll_work(group, 1, false); @@ -900,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); - struct psi_group *group; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - group = task_psi_group(task); - do { - psi_group_change(group, cpu, clear, set, true); - } while ((group = group->parent)); + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { - struct psi_group *group, *common = NULL; + struct psi_group *common = NULL; int cpu = task_cpu(prev); + u64 now; + + psi_write_begin(cpu); + now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -926,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. */ - group = task_psi_group(next); - do { - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & - PSI_ONCPU) { + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { common = group; break; } - - psi_group_change(group, cpu, 0, TSK_ONCPU, true); - } while ((group = group->parent)); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } } if (prev->pid) { @@ -968,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - group = task_psi_group(prev); - do { + for_each_group(group, task_psi_group(prev)) { if (group == common) break; - psi_group_change(group, cpu, clear, set, wake_clock); - } while ((group = group->parent)); + psi_group_change(group, cpu, clear, set, now, wake_clock); + } /* * TSK_ONCPU is handled up to the common ancestor. 
If there are @@ -983,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, wake_clock); + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } + psi_write_end(cpu); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { int cpu = task_cpu(curr); - struct psi_group *group; struct psi_group_cpu *groupc; s64 delta; u64 irq; + u64 now; if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; @@ -1005,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; lockdep_assert_rq_held(rq); - group = task_psi_group(curr); - if (prev && task_psi_group(prev) == group) + if (prev && task_psi_group(prev) == task_psi_group(curr)) return; irq = irq_time_read(cpu); @@ -1015,27 +1031,24 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; rq->psi_irq_time = irq; - do { - u64 now; + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(curr)) { if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; - write_seqcount_end(&groupc->seq); - if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) psi_schedule_rtpoll_work(group, 1, false); - } while ((group = group->parent)); + } + psi_write_end(cpu); } -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -1221,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group) return; for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; + u64 now; + + guard(rq_lock_irq)(cpu_rq(cpu)); - rq_lock_irq(rq, &rf); - psi_group_change(group, cpu, 0, 0, true); - rq_unlock_irq(rq, &rf); + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); } } #endif /* CONFIG_CGROUPS */ @@ -1651,7 +1666,7 @@ static const struct proc_ops psi_irq_proc_ops = { .proc_poll = psi_fop_poll, .proc_release = psi_fop_release, }; -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ static int __init psi_proc_init(void) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40422c37033..7936d4333731 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,6 +4,9 @@ * policies) */ +#include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. 
*/ static const u64 max_rt_runtime = MAX_BW; @@ -60,7 +63,7 @@ static int __init sched_rt_sysctl_init(void) return 0; } late_initcall(sched_rt_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ void init_rt_rq(struct rt_rq *rt_rq) { @@ -75,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -291,7 +292,7 @@ err: return 0; } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #define rt_entity_is_task(rt_se) (1) @@ -327,9 +328,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { @@ -430,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else - -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void rt_queue_push_tasks(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -485,12 +469,12 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) return cpu_cap >= min(min_cap, max_cap); } -#else +#else /* !CONFIG_UCLAMP_TASK: */ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { return true; } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_RT_GROUP_SCHED @@ -594,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { return this_rq()->rd->span; } -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) @@ -625,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) rt_rq->rt_time < rt_b->rt_runtime); } -#ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. 
*/ @@ -798,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP */ -static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -930,7 +903,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ typedef struct rt_rq *rt_rq_iter_t; @@ -977,12 +950,10 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } -#ifdef CONFIG_SMP static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } -#endif -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline int rt_se_prio(struct sched_rt_entity *rt_se) { @@ -1033,7 +1004,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } static void @@ -1075,8 +1046,6 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) cpufreq_update_util(rq, 0); } -#if defined CONFIG_SMP - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { @@ -1107,16 +1076,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* CONFIG_SMP */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { @@ -1155,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED static void @@ -1182,7 +1134,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) @@ -1192,7 +1144,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) @@ -1488,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1538,7 +1493,6 @@ static void yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr, 0); } -#ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); static int @@ -1653,7 +1607,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Preempt the current task with a newly woken task if needed: @@ -1667,7 +1620,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) return; } -#ifdef CONFIG_SMP /* * If: * @@ 
-1682,7 +1634,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -1768,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + if (task_is_blocked(p)) + return; /* * The previous task needs to be made eligible for pushing * if it is still active @@ -1776,8 +1729,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s enqueue_pushable_task(rq, p); } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define RT_MAX_TRIES 3 @@ -2451,7 +2402,6 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#endif /* CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue @@ -2475,10 +2425,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * then see if we can move to another run queue. */ if (task_on_rq_queued(p)) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); -#endif /* CONFIG_SMP */ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } @@ -2495,7 +2443,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) return; if (task_current_donor(rq, p)) { -#ifdef CONFIG_SMP /* * If our priority decreases while running, we * may need to pull tasks to this runqueue. @@ -2509,11 +2456,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_curr(rq); -#endif /* CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2549,9 +2491,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } } -#else +#else /* !CONFIG_POSIX_TIMERS: */ static inline void watchdog(struct rq *rq, struct task_struct *p) { } -#endif +#endif /* !CONFIG_POSIX_TIMERS */ /* * scheduler tick hitting a task of our scheduling class. @@ -2620,7 +2562,7 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) return rt_rq_throttled(rt_rq); } -#endif +#endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { @@ -2634,7 +2576,6 @@ DEFINE_SCHED_CLASS(rt) = { .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, -#ifdef CONFIG_SMP .balance = balance_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, @@ -2643,7 +2584,6 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif .task_tick = task_tick_rt, @@ -2887,7 +2827,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) @@ -2895,7 +2835,7 @@ static int sched_rt_global_constraints(void) return 0; } #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) @@ -2951,6 +2891,12 @@ undo: sched_domains_mutex_unlock(); mutex_unlock(&mutex); + /* + * After changing maximum available bandwidth for DEADLINE, we need to + * recompute per root domain and per cpus variables accordingly. 
+ */ + rebuild_sched_domains(); + return ret; } diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index c529706bed11..6803cfec7a1e 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ +#include <linux/types.h> static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 83e3aa917142..d3f33d10c58c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -69,6 +69,7 @@ #include <linux/wait_bit.h> #include <linux/workqueue_api.h> #include <linux/delayacct.h> +#include <linux/mmu_context.h> #include <trace/events/power.h> #include <trace/events/sched.h> @@ -384,6 +385,7 @@ extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); +extern void sched_init_dl_servers(void); extern void dl_server_update_idle_time(struct rq *rq, struct task_struct *p); @@ -401,6 +403,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; +#ifdef CONFIG_CFS_BANDWIDTH +extern const u64 max_bw_quota_period_us; + +/* + * default period for group bandwidth. + * default: 0.1s, units: microseconds + */ +static inline u64 default_bw_period_us(void) +{ + return 100000ULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ + struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; @@ -424,7 +439,7 @@ struct cfs_bandwidth { int nr_burst; u64 throttled_time; u64 burst_time; -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ }; /* Task group related information */ @@ -442,15 +457,13 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. 
*/ atomic_long_t load_avg ____cacheline_aligned; -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; @@ -531,7 +544,7 @@ extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void free_fair_sched_group(struct task_group *tg) { } static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { @@ -539,7 +552,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou } static inline void online_fair_sched_group(struct task_group *tg) { } static inline void unregister_fair_sched_group(struct task_group *tg) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -573,25 +586,20 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); -#ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP */ -static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* CONFIG_SMP */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ struct cfs_bandwidth { }; static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); @@ -667,7 +675,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SMP /* * CFS load tracking */ @@ -699,7 +706,6 @@ struct cfs_rq { u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ @@ -796,19 +802,13 @@ struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; unsigned int rr_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP int next; /* next highest */ -#endif } highest_prio; -#endif -#ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; -#endif /* CONFIG_SMP */ int rt_queued; #ifdef CONFIG_RT_GROUP_SCHED @@ -839,7 +839,6 @@ struct dl_rq { unsigned int dl_nr_running; -#ifdef CONFIG_SMP /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates @@ -859,9 +858,7 @@ struct dl_rq { * of the leftmost (earliest deadline) element. 
*/ struct rb_root_cached pushable_dl_tasks_root; -#else - struct dl_bw dl_bw; -#endif + /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -932,7 +929,6 @@ static inline long se_runnable(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * XXX we want to get rid of these helpers and use the full load resolution. */ @@ -1008,7 +1004,7 @@ struct root_domain { /* These atomics are updated outside of a lock */ atomic_t rto_loop_next; atomic_t rto_loop_start; -#endif +#endif /* HAVE_RT_PUSH_IPI */ /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -1043,7 +1039,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status) #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK /* @@ -1107,18 +1102,14 @@ struct rq { unsigned int numa_migrate_on; #endif #ifdef CONFIG_NO_HZ_COMMON -#ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; call_single_data_t nohz_csd; -#endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_SMP unsigned int ttwu_pending; -#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -1151,10 +1142,15 @@ struct rq { */ unsigned long nr_uninterruptible; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else union { struct task_struct __rcu *donor; /* Scheduler context */ struct task_struct __rcu *curr; /* Execution context */ }; +#endif struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1183,7 +1179,6 @@ struct rq { int membarrier_state; #endif -#ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1224,7 +1219,6 @@ struct rq { #ifdef CONFIG_HOTPLUG_CPU struct rcuwait hotplug_wait; #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1242,9 +1236,7 @@ struct rq { long calc_load_active; #ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP call_single_data_t hrtick_csd; -#endif struct hrtimer hrtick_timer; ktime_t hrtick_time; #endif @@ -1271,9 +1263,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#ifdef CONFIG_SMP unsigned int nr_pinned; -#endif unsigned int push_busy; struct cpu_stop_work push_work; @@ -1294,12 +1284,12 @@ struct rq { unsigned int core_forceidle_seq; unsigned int core_forceidle_occupation; u64 core_forceidle_start; -#endif +#endif /* CONFIG_SCHED_CORE */ /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; -#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) +#ifdef CONFIG_CFS_BANDWIDTH call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif @@ -1313,32 +1303,24 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { return container_of(cfs_rq, struct rq, cfs); } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline int cpu_of(struct rq *rq) { -#ifdef CONFIG_SMP return rq->cpu; -#else - return 0; -#endif } #define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { -#ifdef CONFIG_SMP return p->migration_disabled; -#else - return false; -#endif } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ 
-1349,10 +1331,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + rcu_assign_pointer(rq->donor, t); +} +#else static inline void rq_set_donor(struct rq *rq, struct task_struct *t) { /* Do nothing */ } +#endif #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1500,6 +1489,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ + #ifdef CONFIG_RT_GROUP_SCHED # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED DECLARE_STATIC_KEY_FALSE(rt_group_sched); @@ -1507,16 +1497,16 @@ static inline bool rt_group_sched_enabled(void) { return static_branch_unlikely(&rt_group_sched); } -# else +# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */ DECLARE_STATIC_KEY_TRUE(rt_group_sched); static inline bool rt_group_sched_enabled(void) { return static_branch_likely(&rt_group_sched); } -# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ -#else +# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ # define rt_group_sched_enabled() false -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1574,9 +1564,9 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline void update_idle_core(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_SMT */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1757,7 +1747,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq) WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); } -#else /* !CONFIG_SCHED_CLASS_EXT */ +#else /* !CONFIG_SCHED_CLASS_EXT: */ #define scx_enabled() false #define scx_switched_all() false @@ -1781,9 +1771,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -1961,8 +1949,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_SMP - static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2128,8 +2114,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2174,7 +2158,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } #else /* !CONFIG_CGROUP_SCHED: */ @@ -2200,7 +2184,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; -#endif +#endif /* CONFIG_SMP */ } /* @@ -2278,13 +2262,17 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) return rq->donor == p; } +static inline bool task_is_blocked(struct task_struct *p) +{ + if (!sched_proxy_exec()) + return false; + + return !!p->blocked_on; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_SMP return p->on_cpu; -#else - 
return task_current(rq, p); -#endif } static inline int task_on_rq_queued(struct task_struct *p) @@ -2307,11 +2295,9 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ -#ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); static_assert(WF_TTWU == SD_BALANCE_WAKE); -#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -2367,11 +2353,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x10 #define ENQUEUE_REPLENISH 0x20 -#ifdef CONFIG_SMP #define ENQUEUE_MIGRATED 0x40 -#else -#define ENQUEUE_MIGRATED 0x00 -#endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 @@ -2416,7 +2398,6 @@ struct sched_class { void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); -#ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -2429,7 +2410,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); @@ -2487,7 +2467,7 @@ static inline void put_prev_set_next_task(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); __put_prev_set_next_dl_server(rq, prev, next); @@ -2581,8 +2561,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_MIGRATE_ENABLE 0x04 #define SCA_USER 0x08 -#ifdef CONFIG_SMP - extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void sched_balance_trigger(struct rq *rq); @@ -2634,26 +2612,6 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#else /* !CONFIG_SMP: */ - -static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -{ - return true; -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_CPU_IDLE static inline void idle_set_state(struct rq *rq, @@ -2749,10 +2707,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) call_trace_sched_update_nr_running(rq, count); } -#ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) set_rd_overloaded(rq->rd, 1); -#endif sched_update_tick_dependency(rq); } @@ -2918,10 +2874,7 @@ unsigned long arch_scale_freq_capacity(int cpu) static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); - /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ -#ifdef CONFIG_SMP rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); -#endif } #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) 
\ @@ -2930,8 +2883,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } -#ifdef CONFIG_SMP - static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { #ifdef CONFIG_SCHED_CORE @@ -2954,7 +2905,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) /* * __sched_core_flip() relies on SMT having cpu-id lock order. */ -#endif +#endif /* CONFIG_SCHED_CORE */ return rq1->cpu < rq2->cpu; } @@ -3091,42 +3042,6 @@ extern void set_rq_offline(struct rq *rq); extern bool sched_smp_initialized; -#else /* !CONFIG_SMP: */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - WARN_ON_ONCE(!irqs_disabled()); - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_lock(rq1); - __acquire(rq2->lock); /* Fake it out ;) */ - double_rq_clock_clear_update(rq1, rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_unlock(rq1); - __release(rq2->lock); -} - -#endif /* !CONFIG_SMP */ - DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) @@ -3145,6 +3060,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); + #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void @@ -3184,7 +3100,7 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif /* !CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_run_idle_balance(int cpu); #else static inline void nohz_run_idle_balance(int cpu) { } @@ -3254,14 +3170,14 @@ static inline u64 irq_time_read(int cpu) return total; } -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline int irqtime_enabled(void) { return 0; } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3310,8 +3226,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_SMP - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3355,10 +3269,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP */ -static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -3535,13 +3445,13 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ #define perf_domain_span(pd) NULL static inline bool sched_energy_enabled(void) { return false; } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #ifdef CONFIG_MEMBARRIER @@ -3567,7 +3477,7 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else /* !CONFIG_MEMBARRIER :*/ +#else /* !CONFIG_MEMBARRIER: */ static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, @@ -3577,7 +3487,6 @@ static inline void membarrier_switch_mm(struct rq *rq, #endif /* !CONFIG_MEMBARRIER */ -#ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) { if (!(p->flags & PF_KTHREAD)) @@ -3588,7 +3497,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3887,7 +3795,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); -#ifdef CONFIG_SMP static inline void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) { @@ -3908,7 +3815,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif #ifdef CONFIG_RT_MUTEXES @@ -3949,21 +3855,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio); -#ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#else - -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} - -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - -#endif #ifdef CONFIG_SCHED_CLASS_EXT /* diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 21ac44428bb0..7f151d96dba9 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_SCHED_SMP_H +#define _KERNEL_SCHED_SMP_H + /* * Scheduler internal SMP callback types and methods between the scheduler * and other internal parts of the core kernel: */ +#include <linux/types.h> extern void sched_ttwu_pending(void *arg); @@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void); #else static inline void flush_smp_call_function_queue(void) { } #endif + +#endif /* _KERNEL_SCHED_SMP_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 4346fd81c31f..d1c9429a4ac5 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,6 +2,7 @@ /* * /proc/schedstat implementation */ +#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) @@ -114,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "timestamp %lu\n", jiffies); } else { struct rq *rq; -#ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; -#endif cpu = (unsigned long)(v - 2); rq = cpu_rq(cpu); @@ -132,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); -#ifdef CONFIG_SMP /* domain-specific stats */ 
rcu_read_lock(); for_each_domain(cpu, sd) { @@ -163,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 452826df6ae1..26f3fd4d34ce 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); } -#else /* CONFIG_PSI */ +#else /* !CONFIG_PSI: */ static inline void psi_enqueue(struct task_struct *p, bool migrate) {} static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} @@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev, bool sleep) {} static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /* CONFIG_PSI */ +#endif /* !CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO /* @@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_enqueue(rq, t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHED_INFO */ +#endif /* !CONFIG_SCHED_INFO */ #endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 058dd42e3d9b..2d4e279f05ee 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,8 +7,8 @@ * * See kernel/stop_machine.c */ +#include "sched.h" -#ifdef CONFIG_SMP static int select_task_rq_stop(struct task_struct *p, int cpu, int flags) { @@ -20,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return sched_stop_runnable(rq); } -#endif /* CONFIG_SMP */ static void wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) @@ -106,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = { .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, -#ifdef CONFIG_SMP .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_stop, diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 72505cd3b60a..0fef6496c4c8 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,6 +2,7 @@ /* * <linux/swait.h> (simple wait queues ) implementation: */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 547c1f05b667..77ae87f36e84 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment) return 0; } -#endif +#endif /* __ARCH_WANT_SYS_NICE */ /** * task_prio - return the priority value of a given task. 
@@ -209,10 +209,8 @@ int idle_cpu(int cpu) if (rq->nr_running) return 0; -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -255,8 +253,7 @@ int sched_core_idle_cpu(int cpu) return idle_cpu(cpu); } - -#endif +#endif /* CONFIG_SCHED_CORE */ /** * find_process_by_pid - find a process with a matching PID value. @@ -448,7 +445,7 @@ static inline int uclamp_validate(struct task_struct *p, } static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ /* * Allow unprivileged RT tasks to decrease priority. @@ -642,7 +639,6 @@ change: goto unlock; } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; @@ -658,7 +654,6 @@ change: goto unlock; } } -#endif } /* Re-check policy now with rq lock held: */ @@ -1120,7 +1115,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } -#ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { /* @@ -1149,7 +1143,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; } -#endif /* CONFIG_SMP */ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { @@ -1242,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { + } else { return -ENOMEM; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b958fe48e020..977e133bb8a4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,7 +3,9 @@ * Scheduler topology setup/handling methods */ +#include <linux/sched/isolation.h> #include <linux/bsearch.h> +#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); void sched_domains_mutex_lock(void) @@ -87,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!(sd->flags & SD_OVERLAP) && + if (!(sd->flags & SD_NUMA) && cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); @@ -100,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_span(group))); - if ((sd->flags & SD_OVERLAP) && + if ((sd->flags & SD_NUMA) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); @@ -313,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void) } late_initcall(sched_energy_aware_sysctl_init); -#endif +#endif /* CONFIG_PROC_SYSCTL */ static void free_pd(struct perf_domain *pd) { @@ -449,9 +451,9 @@ free: return false; } -#else +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ static void free_rootdomain(struct rcu_head *rcu) { @@ -1318,8 +1320,6 @@ next: update_group_capacity(sd, cpu); } -#ifdef CONFIG_SMP - /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. 
*/ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) { @@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" * which is shared by all the overlapping groups. */ - WARN_ON_ONCE(sd->flags & SD_OVERLAP); + WARN_ON_ONCE(sd->flags & SD_NUMA); sg = sd->groups; if (cpu != sg->asym_prefer_cpu) { @@ -1374,8 +1374,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) } } -#endif /* CONFIG_SMP */ - /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same @@ -1598,7 +1596,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -#endif +#endif /* CONFIG_NUMA */ /* * SD_flags allowed in topology descriptions. @@ -1714,7 +1712,7 @@ sd_init(struct sched_domain_topology_level *tl, SD_WAKE_AFFINE); } -#endif +#endif /* CONFIG_NUMA */ } else { sd->cache_nice_tries = 1; } @@ -1739,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, + SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), #endif - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; @@ -2010,23 +2008,14 @@ void sched_init_numa(int offline_node) /* * Add the NUMA identity distance, aka single NODE. */ - tl[i++] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .numa_level = 0, - SD_INIT_NAME(NODE) - }; + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); /* * .. and append 'j' levels of NUMA goodness. 
*/ for (j = 1; j < nr_levels; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); + tl[i].numa_level = j; } sched_domain_topology_saved = sched_domain_topology; @@ -2337,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sd) { sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) + if (sd && (sd->flags & SD_NUMA)) free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sd, j)); } @@ -2403,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map) id_seen = sched_domains_tmpmask2; for_each_sd_topology(tl) { + int tl_common_flags = 0; + + if (tl->sd_flags) + tl_common_flags = (*tl->sd_flags)(); /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) + if (tl_common_flags & SD_NUMA) continue; cpumask_clear(covered); @@ -2476,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } @@ -2490,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { + if (sd->flags & SD_NUMA) { if (build_overlap_sched_groups(sd, i)) goto error; } else { diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f4701..20f27e2cf7ae 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,6 +4,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -40,13 +41,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ { unsigned long flags; - wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + wq_entry->flags |= WQ_FLAG_PRIORITY; spin_lock_irqsave(&wq_head->lock, flags); __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry) +{ + struct list_head *head = &wq_head->head; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + + guard(spinlock_irqsave)(&wq_head->lock); + + if (!list_empty(head) && + (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY)) + return -EBUSY; + + list_add(&wq_entry->entry, head); + return 0; +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -64,7 +83,7 @@ EXPORT_SYMBOL(remove_wait_queue); * the non-exclusive tasks. Normally, exclusive tasks will be at the end of * the list and any non-exclusive tasks will be woken first. A priority task * may be at the head of the list, and can consume the event without any other - * tasks being woken. + * tasks being woken if it's also an exclusive task. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. 
try_to_wake_up() returns diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index b410b61cec95..1088d3b7012c 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/sched/debug.h> +#include "sched.h" + /* * The implementation of the wait_bit*() and related waiting APIs: */ diff --git a/kernel/smp.c b/kernel/smp.c index 974f3a3962e8..4649fa4872ff 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -86,13 +86,15 @@ int smpcfd_dead_cpu(unsigned int cpu) int smpcfd_dying_cpu(unsigned int cpu) { /* - * The IPIs for the smp-call-function callbacks queued by other - * CPUs might arrive late, either due to hardware latencies or - * because this CPU disabled interrupts (inside stop-machine) - * before the IPIs were sent. So flush out any pending callbacks - * explicitly (without waiting for the IPIs to arrive), to - * ensure that the outgoing CPU doesn't go offline with work - * still pending. + * The IPIs for the smp-call-function callbacks queued by other CPUs + * might arrive late, either due to hardware latencies or because this + * CPU disabled interrupts (inside stop-machine) before the IPIs were + * sent. So flush out any pending callbacks explicitly (without waiting + * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go + * offline with work still pending. + * + * This runs with interrupts disabled inside the stopper task invoked by + * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush. */ __flush_smp_call_function_queue(false); irq_work_run(); @@ -418,6 +420,10 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) */ static int generic_exec_single(int cpu, call_single_data_t *csd) { + /* + * Preemption already disabled here so stopper cannot run on this CPU, + * ensuring mutually exclusive CPU offlining and last IPI flush. + */ if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; @@ -638,8 +644,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int err; /* - * prevent preemption and reschedule on another processor, - * as well as CPU removal + * Prevent preemption and reschedule on another CPU, as well as CPU + * removal. This prevents stopper from running on this CPU, thus + * providing mutual exclusion of the below cpu_online() check and + * IPI sending ensuring IPI are not missed by CPU going offline. */ this_cpu = get_cpu(); @@ -741,32 +749,19 @@ EXPORT_SYMBOL_GPL(smp_call_function_single_async); * * Selection preference: * 1) current cpu if in @mask - * 2) any cpu of current node if in @mask - * 3) any other online cpu in @mask + * 2) nearest cpu in @mask, based on NUMA topology */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; - const struct cpumask *nodemask; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); - if (cpumask_test_cpu(cpu, mask)) - goto call; - - /* Try for same node. */ - nodemask = cpumask_of_node(cpu_to_node(cpu)); - for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; - cpu = cpumask_next_and(cpu, nodemask, mask)) { - if (cpu_online(cpu)) - goto call; - } + if (!cpumask_test_cpu(cpu, mask)) + cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu)); - /* Any online will do: smp_call_function_single handles nr_cpu_ids. 
*/ - cpu = cpumask_any_and(mask, cpu_online_mask); -call: ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; @@ -792,7 +787,6 @@ static void smp_call_function_many_cond(const struct cpumask *mask, bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; - bool run_local = false; lockdep_assert_preemption_disabled(); @@ -814,19 +808,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, */ WARN_ON_ONCE(!in_task()); - /* Check if we need local execution. */ - if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && - (!cond_func || cond_func(this_cpu, info))) - run_local = true; - /* Check if we need remote execution, i.e., any CPU excluding this one. */ - cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == this_cpu) - cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (cpu < nr_cpu_ids) - run_remote = true; - - if (run_remote) { + if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); @@ -840,6 +823,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, continue; } + /* Work is enqueued on a remote CPU. */ + run_remote = true; + csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; @@ -851,6 +837,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask, #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); + /* + * Kick the remote CPU if this is the first work + * item enqueued. + */ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; @@ -869,7 +859,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, send_call_function_ipi_mask(cfd->cpumask_ipi); } - if (run_local) { + /* Check if we need local execution. */ + if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && + (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1992b62e980b..4503b60ce9bd 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -18,8 +18,6 @@ #include "smpboot.h" -#ifdef CONFIG_SMP - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -76,8 +74,6 @@ void __init idle_threads_init(void) } #endif -#endif /* #ifdef CONFIG_SMP */ - static LIST_HEAD(hotplug_threads); static DEFINE_MUTEX(smpboot_threads_lock); diff --git a/kernel/sys.c b/kernel/sys.c index adc0de0aa364..18a037cc6f61 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -181,6 +181,35 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); +static const struct ctl_table overflow_sysctl_table[] = { + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, +}; + +static int __init init_overflow_sysctl(void) +{ + register_sysctl_init("kernel", overflow_sysctl_table); + return 0; +} + +postcore_initcall(init_overflow_sysctl); + /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. 
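As a minimal sketch (not part of the patch) of the registration pattern the kernel/sys.c hunk above applies when it moves the overflowuid/overflowgid knobs into a subsystem-local table: a sentinel-less const ctl_table plus register_sysctl_init() from an initcall. The knob and table names below are hypothetical, chosen only to illustrate the idiom.

/*
 * Sketch only: "example_value" and example_sysctl_table are illustrative
 * names, not something this series adds.
 */
#include <linux/sysctl.h>
#include <linux/init.h>

static int example_value;

static const struct ctl_table example_sysctl_table[] = {
	{
		.procname	= "example_value",	/* shows up as /proc/sys/kernel/example_value */
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_INT_MAX,
	},
};

static int __init example_sysctl_init(void)
{
	/* Register under /proc/sys/kernel, like the overflow knobs above. */
	register_sysctl_init("kernel", example_sysctl_table);
	return 0;
}
postcore_initcall(example_sysctl_init);
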
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9b4f0cff76ea..cb6196e3fa99 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,69 +1,28 @@ // SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface - * - * Begun 24 March 1995, Stephen Tweedie - * Added /proc support, Dec 1995 - * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. - * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. - * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. - * Dynamic registration fixes, Stephen Tweedie. - * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. - * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris - * Horn. - * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. - * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. - * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill - * Wendling. - * The list_for_each() macro wasn't appropriate for the sysctl loop. - * Removed it and replaced it with older style, 03/23/00, Bill Wendling */ -#include <linux/module.h> #include <linux/sysctl.h> #include <linux/bitmap.h> -#include <linux/printk.h> #include <linux/proc_fs.h> -#include <linux/security.h> #include <linux/ctype.h> -#include <linux/filter.h> -#include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kobject.h> -#include <linux/net.h> -#include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> -#include <linux/ratelimit.h> #include <linux/initrd.h> -#include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> #include <linux/syscalls.h> -#include <linux/nfs_fs.h> -#include <linux/acpi.h> -#include <linux/reboot.h> -#include <linux/kmod.h> #include <linux/capability.h> -#include <linux/binfmts.h> -#include <linux/sched/sysctl.h> -#include <linux/mount.h> -#include <linux/pid.h> #include "../lib/kstrtox.h" #include <linux/uaccess.h> #include <asm/processor.h> -#ifdef CONFIG_X86 -#include <asm/nmi.h> -#include <asm/io.h> -#endif -#ifdef CONFIG_RT_MUTEXES -#include <linux/rtmutex.h> -#endif - /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; EXPORT_SYMBOL(sysctl_vals); @@ -736,49 +695,6 @@ int proc_douintvec(const struct ctl_table *table, int write, void *buffer, do_proc_douintvec_conv, NULL); } -/* - * Taint values can only be increased - * This means we can safely use a temporary. - */ -static int proc_taint(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long tmptaint = get_taint(); - int err; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - if (write) { - int i; - - /* - * If we are relying on panic_on_taint not producing - * false positives due to userspace input, bail out - * before setting the requested taint flags. - */ - if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) - return -EINVAL; - - /* - * Poor man's atomic or. 
Not worth adding a primitive - * to everyone's atomic.h for this - */ - for (i = 0; i < TAINT_FLAGS_COUNT; i++) - if ((1UL << i) & tmptaint) - add_taint(i, LOCKDEP_STILL_OK); - } - - return err; -} - /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -968,26 +884,6 @@ int proc_dou8vec_minmax(const struct ctl_table *table, int write, } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int tmp, ret; - - tmp = sysrq_mask(); - - ret = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (ret || !write) - return ret; - - if (write) - sysrq_toggle_support(tmp); - - return 0; -} -#endif - static int __do_proc_doulongvec_minmax(void *data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, @@ -1292,28 +1188,6 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buf do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - struct pid *new_pid; - pid_t tmp; - int r; - - tmp = pid_vnr(cad_pid); - - r = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (r || !write) - return r; - - new_pid = find_get_pid(tmp); - if (!new_pid) - return -ESRCH; - - put_pid(xchg(&cad_pid, new_pid)); - return 0; -} - /** * proc_do_large_bitmap - read/write from/to a large bitmap * @table: the sysctl table @@ -1580,15 +1454,9 @@ int proc_do_static_key(const struct ctl_table *table, int write, return ret; } -static const struct ctl_table kern_table[] = { +static const struct ctl_table sysctl_subsys_table[] = { #ifdef CONFIG_PROC_SYSCTL { - .procname = "tainted", - .maxlen = sizeof(long), - .mode = 0644, - .proc_handler = proc_taint, - }, - { .procname = "sysctl_writes_strict", .data = &sysctl_writes_strict, .maxlen = sizeof(int), @@ -1598,95 +1466,6 @@ static const struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_PARISC - { - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW - { - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MODULES - { - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "modules_disabled", - .data = &modules_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_UEVENT_HELPER - { - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .procname = "sysrq", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = sysrq_sysctl_handler, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = proc_do_cad_pid, - }, -#endif - { - .procname = "threads-max", - .data = NULL, - .maxlen = sizeof(int), - 
.mode = 0644, - .proc_handler = sysctl_max_threads, - }, - { - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, - { - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, { .procname = "ngroups_max", .data = (void *)&ngroups_max, @@ -1701,20 +1480,10 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ - defined(CONFIG_DEBUG_STACKOVERFLOW) - { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_MMU) +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { - .procname = "randomize_va_space", - .data = &randomize_va_space, + .procname = "unaligned-trap", + .data = &unaligned_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1729,40 +1498,11 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_RT_MUTEXES - { - .procname = "max_lock_depth", - .data = &max_lock_depth, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_TREE_RCU - { - .procname = "panic_on_rcu_stall", - .data = &sysctl_panic_on_rcu_stall, - .maxlen = sizeof(sysctl_panic_on_rcu_stall), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "max_rcu_stall_to_panic", - .data = &sysctl_max_rcu_stall_to_panic, - .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_INT_MAX, - }, -#endif }; int __init sysctl_init_bases(void) { - register_sysctl_init("kernel", kern_table); + register_sysctl_init("kernel", sysctl_subsys_table); return 0; } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b0b97a60aaa6..7c6a52f7836c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. @@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US interval and NTP's maximum frequency drift of 500 parts per million. If the clocksource is good enough for NTP, it is good enough for the clocksource watchdog! +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS + help + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time. 
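A rough userspace sketch of what the POSIX_AUX_CLOCKS help text above enables, assuming the CLOCK_AUX clock ids introduced elsewhere in this series are visible through the uapi headers; the fallback value below is purely a placeholder and not taken from the patch.

/* Sketch only: read the first auxiliary clock via clock_gettime(). */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_AUX
#define CLOCK_AUX 16	/* placeholder; use the real uapi definition */
#endif

int main(void)
{
	struct timespec ts;

	if (clock_gettime(CLOCK_AUX, &ts)) {
		/* Fails (e.g. EINVAL/ENODEV) if the aux clock is not enabled. */
		perror("clock_gettime(CLOCK_AUX)");
		return 1;
	}
	printf("aux clock 0: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
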
endmenu -endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a8bc7da9062..e400fe150f9d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_first(cpu_online_mask); - if (cpu == smp_processor_id()) - cpu = cpumask_next(cpu, cpu_online_mask); + cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpumask_set_cpu(cpu, &cpus_chosen); @@ -589,9 +587,7 @@ static void clocksource_watchdog(struct timer_list *unused) * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); /* * Arm timer if not already pending: could race with concurrent diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e3642278df43..667452768ed3 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -242,6 +242,11 @@ static void timens_set_vvar_page(struct task_struct *task, for (i = 0; i < CS_BASES; i++) timens_setup_vdso_clock_data(&vc[i], ns); + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } + out: mutex_unlock(&offset_lock); } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b837d3d9d325..97fa99b96dd0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/rtc.h> #include <linux/audit.h> +#include <linux/timekeeper_internal.h> #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -86,14 +87,16 @@ struct ntp_data { #endif }; -static struct ntp_data tk_ntp_data = { - .tick_usec = USER_TICK_USEC, - .time_state = TIME_OK, - .time_status = STA_UNSYNC, - .time_constant = 2, - .time_maxerror = NTP_PHASE_LIMIT, - .time_esterror = NTP_PHASE_LIMIT, - .ntp_next_leap_sec = TIME64_MAX, +static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = { + [ 0 ... TIMEKEEPERS_MAX - 1 ] = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, + }, }; #define SECS_PER_DAY 86400 @@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). 
*/ - real_secs = __ktime_get_real_seconds(); + real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); secs = (long)(real_secs - ntpdata->time_reftime); if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; @@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata) /** * ntp_clear - Clears the NTP state variables + * @tkid: Timekeeper ID to be able to select proper ntp data array member */ -void ntp_clear(void) +void ntp_clear(unsigned int tkid) { - __ntp_clear(&tk_ntp_data); + __ntp_clear(&tk_ntp_data[tkid]); } -u64 ntp_tick_length(void) +u64 ntp_tick_length(unsigned int tkid) { - return tk_ntp_data.tick_length; + return tk_ntp_data[tkid].tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * @tkid: Timekeeper ID * - * Provides the time of the next leapsecond against CLOCK_REALTIME in - * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next + * leap second against CLOCK_REALTIME in a ktime_t format if a + * leap second is pending. KTIME_MAX otherwise. */ -ktime_t ntp_get_next_leap(void) +ktime_t ntp_get_next_leap(unsigned int tkid) { - struct ntp_data *ntpdata = &tk_ntp_data; - ktime_t ret; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + + if (tkid != TIMEKEEPER_CORE) + return KTIME_MAX; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) return ktime_set(ntpdata->ntp_next_leap_sec, 0); - ret = KTIME_MAX; - return ret; + + return KTIME_MAX; } /* @@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(time64_t secs) +int second_overflow(unsigned int tkid, time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; s64 delta; int leap = 0; s32 rem; @@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns */ static inline bool ntp_synced(void) { - return !(tk_ntp_data.time_status & STA_UNSYNC); + return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC); } /* @@ -702,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k * reference time to current time. */ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) - ntpdata->time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); /* only set allowed bits */ ntpdata->time_status &= STA_RONLY; @@ -759,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. 
*/ -int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad) +int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; if (txc->modes & ADJ_ADJTIME) { @@ -1031,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; struct pps_normtime pts_norm, freq_norm; - struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); @@ -1083,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj); if (rc) return rc; - tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } - __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { - ntp_clear(); + for (int id = 0; id < TIMEKEEPERS_MAX; id++) + __ntp_clear(tk_ntp_data + id); ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 5a633dce9057..7084d839c207 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -3,14 +3,13 @@ #define _LINUX_NTP_INTERNAL_H extern void ntp_init(void); -extern void ntp_clear(void); +extern void ntp_clear(unsigned int tkid); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ -extern u64 ntp_tick_length(void); -extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, - const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad); +extern u64 ntp_tick_length(unsigned int tkid); +extern ktime_t ntp_get_next_leap(unsigned int tkid); +extern int second_overflow(unsigned int tkid, time64_t secs); +extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2053b1a4c9e4..8b582174b1f9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1526,6 +1526,9 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, +#ifdef CONFIG_POSIX_AUX_CLOCKS + [CLOCK_AUX ... 
CLOCK_AUX_LAST] = &clock_aux, +#endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 61906f0688c1..7f259e845d24 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic; extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; +extern const struct k_clock clock_aux; void posix_timer_queue_signal(struct k_itimer *timr); diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index e6285288d765..3d2a354cfe1c 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -6,7 +6,7 @@ #include <linux/timecounter.h> void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, + struct cyclecounter *cc, u64 start_tstamp) { tc->cc = cc; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 83c65f3afcca..059fa8b79be6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -6,6 +6,7 @@ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/kobject.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> @@ -25,6 +26,8 @@ #include <linux/audit.h> #include <linux/random.h> +#include <vdso/auxclock.h> + #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -53,7 +56,32 @@ struct tk_data { raw_spinlock_t lock; } ____cacheline_aligned; -static struct tk_data tk_core; +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} +#endif /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -113,6 +141,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); +#else +static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } +#endif + unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; @@ -601,7 +639,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); @@ -663,7 +701,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd) static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); @@ -678,18 +716,22 @@ 
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (tk->id == TIMEKEEPER_CORE) { + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; @@ -975,9 +1017,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. This internal function - * is called just when timekeeping lock is already held. + * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { @@ -1412,41 +1459,73 @@ int do_settimeofday64(const struct timespec64 *ts) } EXPORT_SYMBOL(do_settimeofday64); +static inline bool timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; +} + /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
*/ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { + struct timekeeper *tks = &tkd->shadow_timekeeper; + struct timespec64 tmp; + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - struct timespec64 tmp; - - timekeeping_forward_now(tks); + timekeeping_forward_now(tks); + if (timekeeper_is_core_tk(tks)) { /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); + timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; + + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + tks->offs_aux = offs; } - /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; + + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(&tk_core, ts); + + /* Signal hrtimers about time change */ + if (!ret) + clock_was_set(CLOCK_SET_WALL); + return ret; +} + /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. @@ -1522,6 +1601,8 @@ static int change_clocksource(void *data) timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } + tk_aux_update_clocksource(); + if (old) { if (old->disable) old->disable(old); @@ -1573,6 +1654,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... 
CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1649,10 +1763,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* @@ -1682,7 +1798,8 @@ void __init timekeeping_init(void) struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -2034,7 +2151,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - u64 ntp_tl = ntp_tick_length(); + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* @@ -2115,7 +2232,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; @@ -2181,16 +2298,14 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; - struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset, orig_offset; - guard(raw_spinlock_irqsave)(&tk_core.lock); - /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; @@ -2214,7 +2329,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); @@ -2239,19 +2354,27 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (orig_offset != offset) tk_update_coarse_nsecs(tk); - timekeeping_update_from_shadow(&tk_core, clock_set); + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(&tk_core, mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if 
(timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2449,7 +2572,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2508,6 +2631,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2526,74 +2663,103 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - * @txc: Pointer to kernel_timex structure containing NTP parameters - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct audit_ntp_data ad; - bool offset_set = false; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); - if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; + add_device_randomness(&ts, sizeof(ts)); + + guard(raw_spinlock_irqsave)(&tkd->lock); + + if (!tks->clock_valid) + return -ENODEV; + + if (txc->modes & ADJ_SETOFFSET) { + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; - - offset_set = delta.tv_sec != 0; - audit_tk_injoffset(delta); + result->clock_set = true; } - audit_ntp_init(&ad); + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); - ktime_get_real_ts64(&ts); - add_device_randomness(&ts, sizeof(ts)); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - s32 orig_tai, tai; + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); - orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + return ret; 
+} - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tks, tai); - timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); - clock_set = true; - } else { - tk_update_leap_state_all(&tk_core); - } - } +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { }; + int ret; - audit_ntp_log(&ad); + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(result.delta); - if (clock_set) + audit_ntp_log(&result.ad); + + if (result.clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(offset_set); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. + */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function @@ -2607,3 +2773,316 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. + */ +static unsigned long aux_timekeepers; + +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + +/** + * ktime_get_aux - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) 
+ * @kt: Pointer to ktime_t to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux(clockid_t id, ktime_t *kt) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tk; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + if (!aux_tkd) + return false; + + aux_tk = &aux_tkd->timekeeper; + do { + seq = read_seqcount_begin(&aux_tkd->seq); + if (!aux_tk->clock_valid) + return false; + + base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); + nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); + } while (read_seqcount_retry(&aux_tkd->seq, seq)); + + *kt = ktime_add_ns(base, nsecs); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux); + +/** + * ktime_get_aux_ts64 - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @ts: Pointer to timespec64 to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) +{ + ktime_t now; + + if (!ktime_get_aux(id, &now)) + return false; + *ts = ktime_to_timespec64(now); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); + +static int aux_get_res(clockid_t id, struct timespec64 *tp) +{ + if (!clockid_aux_valid(id)) + return -ENODEV; + + tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; + tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; + return 0; +} + +static int aux_get_timespec(clockid_t id, struct timespec64 *tp) +{ + return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; +} + +static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks; + ktime_t tnow, nsecs; + + if (!timespec64_valid_settod(tnew)) + return -EINVAL; + if (!aux_tkd) + return -ENODEV; + + aux_tks = &aux_tkd->shadow_timekeeper; + + guard(raw_spinlock_irq)(&aux_tkd->lock); + if (!aux_tks->clock_valid) + return -ENODEV; + + /* Forward the timekeeper base time */ + timekeeping_forward_now(aux_tks); + /* + * Get the updated base time. tkr_mono.base has not been + * updated yet, so do that first. That makes the update + * in timekeeping_update_from_shadow() redundant, but + * that's harmless. After that @tnow can be calculated + * by using tkr_mono::cycle_last, which has been set + * by timekeeping_forward_now(). + */ + tk_update_ktime_data(aux_tks); + nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); + tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); + + /* + * Calculate the new AUX offset as delta to @tnow ("monotonic"). + * That avoids all the tk::xtime back and forth conversions as + * xtime ("realtime") is not applicable for auxiliary clocks and + * kept in sync with "monotonic". + */ + aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow); + + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); + return 0; +} + +static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct adjtimex_result result = { }; + + if (!aux_tkd) + return -ENODEV; + + /* + * @result is ignored for now as there are neither hrtimers nor a + * RTC related to auxiliary clocks for now. 
+ */ + return __do_adjtimex(aux_tkd, txc, &result); +} + +const struct k_clock clock_aux = { + .clock_getres = aux_get_res, + .clock_get_timespec = aux_get_timespec, + .clock_set = aux_clock_set, + .clock_adj = aux_clock_adj, +}; + +static void aux_clock_enable(clockid_t id) +{ + struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; + + /* Prevent the core timekeeper from changing. */ + guard(raw_spinlock_irq)(&tk_core.lock); + + /* + * Setup the auxiliary clock assuming that the raw core timekeeper + * clock frequency conversion is close enough. Userspace has to + * adjust for the deviation via clock_adjtime(2). + */ + guard(raw_spinlock_nested)(&aux_tkd->lock); + + /* Remove leftovers of a previous registration */ + memset(aux_tks, 0, sizeof(*aux_tks)); + /* Restore the timekeeper id */ + aux_tks->id = aux_tkd->timekeeper.id; + /* Setup the timekeeper based on the current system clocksource */ + tk_setup_internals(aux_tks, tkr_raw->clock); + + /* Mark it valid and set it live */ + aux_tks->clock_valid = true; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static void aux_clock_disable(clockid_t id) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + + guard(raw_spinlock_irq)(&aux_tkd->lock); + aux_tkd->shadow_timekeeper.clock_valid = false; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static DEFINE_MUTEX(aux_clock_mutex); + +static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + bool enable; + + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (kstrtobool(buf, &enable) < 0) + return -EINVAL; + + guard(mutex)(&aux_clock_mutex); + if (enable == test_bit(id, &aux_timekeepers)) + return count; + + if (enable) { + aux_clock_enable(CLOCK_AUX + id); + set_bit(id, &aux_timekeepers); + } else { + aux_clock_disable(CLOCK_AUX + id); + clear_bit(id, &aux_timekeepers); + } + return count; +} + +static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + + return sysfs_emit(buf, "%d\n", test_bit(id, &active)); +} + +static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); + +static struct attribute *aux_clock_enable_attrs[] = { + &aux_clock_enable_attr.attr, + NULL +}; + +static const struct attribute_group aux_clock_enable_attr_group = { + .attrs = aux_clock_enable_attrs, +}; + +static int __init tk_aux_sysfs_init(void) +{ + struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + + if (!tko) + return -ENOMEM; + + auxo = kobject_create_and_add("aux_clocks", tko); + if (!auxo) { + kobject_put(tko); + return -ENOMEM; + } + + for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + + if (!clk) + return -ENOMEM; + + int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + + if (ret) + return ret; + } + return 0; +} +late_initcall(tk_aux_sysfs_init); + +static __init void tk_aux_setup(void) +{ + for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) + tkd_basic_setup(&timekeeper_data[i], i, false); +} +#endif /* CONFIG_POSIX_AUX_CLOCKS */ diff --git a/kernel/time/timekeeping_internal.h 
b/kernel/time/timekeeping_internal.h index 8c9079108ffb..973ede670a36 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) unsigned long timekeeper_lock_irqsave(void); void timekeeper_unlock_irqrestore(unsigned long flags); +/* NTP specific interface to access the current seconds value */ +long ktime_get_ntp_seconds(unsigned int id); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 2f6330831f08..c0c54dc5314c 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1405,23 +1405,20 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; do { - if (!tmigr_check_lonely(group)) { + if (!tmigr_check_lonely(group)) return KTIME_MAX; - } else { - /* - * Since current CPU is active, events may not be sorted - * from bottom to the top because the CPU's event is ignored - * up to the top and its sibling's events not propagated upwards. - * Thus keep track of the lowest observed expiry. - */ - nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); - if (!group->parent) - return nextevt; - } + + /* + * Since current CPU is active, events may not be sorted + * from bottom to the top because the CPU's event is ignored + * up to the top and its sibling's events not propagated upwards. + * Thus keep track of the lowest observed expiry. + */ + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); group = group->parent; } while (group); - return KTIME_MAX; + return nextevt; } /* diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 32ef27c71b57..8ba8b0d8a387 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,26 +15,25 @@ #include "timekeeping_internal.h" +static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) +{ + vc->cycle_last = base->cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vc->max_cycles = base->clock->max_cycles; +#endif + vc->mask = base->mask; + vc->mult = base->mult; + vc->shift = base->shift; +} + static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; -#endif - vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; -#endif - vc[CS_RAW].mask = tk->tkr_raw.mask; - vc[CS_RAW].mult = tk->tkr_raw.mult; - vc[CS_RAW].shift = tk->tkr_raw.shift; + fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); + fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; @@ -119,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk) if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_time_data(vdata, tk); - __arch_update_vsyscall(vdata); + __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); + __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); @@ -136,6 +136,46 @@ void update_vsyscall_tz(void) __arch_sync_vdso_time_data(vdata); } +#ifdef CONFIG_POSIX_AUX_CLOCKS +void 
vdso_time_update_aux(struct timekeeper *tk) +{ + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_timestamp *vdso_ts; + struct vdso_clock *vc; + s32 clock_mode; + u64 nsec; + + vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; + vdso_ts = &vc->basetime[VDSO_BASE_AUX]; + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + if (!tk->clock_valid) + clock_mode = VDSO_CLOCKMODE_NONE; + + /* copy vsyscall data */ + vdso_write_begin_clock(vc); + + vc->clock_mode = clock_mode; + + if (clock_mode != VDSO_CLOCKMODE_NONE) { + fill_clock_configuration(vc, &tk->tkr_mono); + + vdso_ts->sec = tk->xtime_sec; + + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->offs_aux; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + nsec = nsec << tk->tkr_mono.shift; + vdso_ts->nsec = nsec; + } + + __arch_update_vdso_clock(vc); + + vdso_write_end_clock(vc); + + __arch_sync_vdso_time_data(vdata); +} +#endif + /** * vdso_update_begin - Start of a VDSO update section * diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a3f35c7d83b6..f135d04e1860 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,6 +53,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool +config HAVE_EXTRA_IPI_TRACEPOINTS + bool + help + For architectures that use ipi_raise, ipi_entry and ipi_exit + tracepoints. + config HAVE_DYNAMIC_FTRACE_WITH_ARGS bool help @@ -74,11 +80,6 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. -config HAVE_FTRACE_MCOUNT_RECORD - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -275,7 +276,7 @@ config FUNCTION_TRACE_ARGS funcgraph-args (for the function graph tracer) config DYNAMIC_FTRACE - bool "enable/disable function tracing dynamically" + bool depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y @@ -803,27 +804,22 @@ config BPF_KPROBE_OVERRIDE Allows BPF to override the execution of a probed function and set a different return value. This is used for error injection. 
-config FTRACE_MCOUNT_RECORD - def_bool y - depends on DYNAMIC_FTRACE - depends on HAVE_FTRACE_MCOUNT_RECORD - config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY bool - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_CC def_bool y depends on $(cc-option,-mrecord-mcount) depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_OBJTOOL def_bool y depends on HAVE_OBJTOOL_MCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT @@ -831,7 +827,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on !FTRACE_MCOUNT_USE_OBJTOOL - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config TRACING_MAP bool diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 132c8be6f635..3ae52978cae6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -781,8 +781,7 @@ BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) return (unsigned long) task_pt_regs(task); } -BTF_ID_LIST(bpf_task_pt_regs_ids) -BTF_ID(struct, pt_regs) +BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs) const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, @@ -1270,7 +1269,7 @@ __bpf_kfunc_start_defs(); * Return: a bpf_key pointer with a valid key pointer if the key is found, a * NULL pointer otherwise. */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) { key_ref_t key_ref; struct bpf_key *bkey; @@ -2466,7 +2465,6 @@ struct bpf_kprobe_multi_link { u32 cnt; u32 mods_cnt; struct module **mods; - u32 flags; }; struct bpf_kprobe_multi_run_ctx { @@ -2586,7 +2584,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; - info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) @@ -2620,10 +2618,37 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + + seq_printf(seq, + "kprobe_cnt:\t%u\n" + "missed:\t%lu\n", + kmulti_link->cnt, + kmulti_link->fp.nmissed); + + seq_printf(seq, "%s\t %s\n", "cookie", "func"); + for (int i = 0; i < kmulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %pS\n", + kmulti_link->cookies[i], + (void *)kmulti_link->addrs[i]); + } +} +#endif + static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_kprobe_multi_show_fdinfo, +#endif }; static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) @@ -2960,7 +2985,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, - 
&bpf_kprobe_multi_link_lops, prog); + &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -2976,7 +3001,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; - link->flags = flags; + link->link.flags = flags; if (cookies) { /* @@ -3045,7 +3070,6 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; - u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3109,7 +3133,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; - info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; @@ -3154,10 +3178,54 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_uprobe_multi_link *umulti_link; + char *p, *buf; + pid_t pid; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return; + + p = d_path(&umulti_link->path, buf, PATH_MAX); + if (IS_ERR(p)) { + kfree(buf); + return; + } + + pid = umulti_link->task ? + task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + seq_printf(seq, + "uprobe_cnt:\t%u\n" + "pid:\t%u\n" + "path:\t%s\n", + umulti_link->cnt, pid, p); + + seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); + for (int i = 0; i < umulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %#llx\t %#lx\n", + umulti_link->uprobes[i].cookie, + umulti_link->uprobes[i].offset, + umulti_link->uprobes[i].ref_ctr_offset); + } + + kfree(buf); +} +#endif + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_uprobe_multi_show_fdinfo, +#endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3369,10 +3437,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; - link->flags = flags; + link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, - &bpf_uprobe_multi_link_lops, prog); + &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index c5b207992fb4..f4d200f0c610 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1325,6 +1325,10 @@ int register_ftrace_graph(struct fgraph_ops *gops) int ret = 0; int i = -1; + if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH, + "function graph ops registered again")) + return -EBUSY; + guard(mutex)(&ftrace_lock); if (!fgraph_stack_cachep) { @@ -1401,17 +1405,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) { int command = 0; + if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH), + "function graph ops unregistered without registering")) + return; + 
guard(mutex)(&ftrace_lock); if (unlikely(!ftrace_graph_active)) - return; + goto out; if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE || fgraph_array[gops->idx] != gops)) - return; + goto out; if (fgraph_lru_release_index(gops->idx) < 0) - return; + goto out; fgraph_array[gops->idx] = &fgraph_stub; @@ -1434,4 +1442,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); } gops->saved_func = NULL; + out: + gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index ba7ff14f5339..c8034dfc1070 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -352,7 +352,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, size_words = SIZE_IN_LONG(size); ret_ip = ftrace_regs_get_instruction_pointer(fregs); - preempt_disable(); + preempt_disable_notrace(); curr = 0; while (size_words > curr) { @@ -368,7 +368,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, } curr += size; } - preempt_enable(); + preempt_enable_notrace(); } NOKPROBE_SYMBOL(fprobe_return); @@ -648,6 +648,11 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) #define FPROBE_IPS_MAX INT_MAX +int fprobe_count_ips_from_filter(const char *filter, const char *notfilter) +{ + return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); +} + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4203fad56b6c..00b76d450a89 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1042,10 +1042,6 @@ static struct ftrace_ops *removed_ops; */ static bool update_all_ops; -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 21bb161c2316..f2fe33573e54 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -17,5 +17,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); -EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00fc38d70e86..5176e0270f07 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage) *bpage = list_entry(p, struct buffer_page, list); } +static inline void rb_dec_page(struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.prev); + + *bpage = list_entry(p, struct buffer_page, list); +} + static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page; + struct buffer_page *head_page, *orig_head; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; + u64 ts; int i; if (!meta || !meta->head_buffer) @@ -1885,8 +1893,98 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); local_set(&cpu_buffer->reader_page->entries, ret); - 
head_page = cpu_buffer->head_page; + orig_head = head_page = cpu_buffer->head_page; + ts = head_page->page->time_stamp; + + /* + * Try to rewind the head so that we can read the pages which already + * read in the previous boot. + */ + if (head_page == cpu_buffer->tail_page) + goto skip_rewind; + + rb_dec_page(&head_page); + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) { + + /* Rewind until tail (writer) page. */ + if (head_page == cpu_buffer->tail_page) + break; + + /* Ensure the page has older data than head. */ + if (ts < head_page->page->time_stamp) + break; + + ts = head_page->page->time_stamp; + /* Ensure the page has correct timestamp and some data. */ + if (!ts || rb_page_commit(head_page) == 0) + break; + + /* Stop rewind if the page is invalid. */ + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) + break; + + /* Recover the number of entries and update stats. */ + local_set(&head_page->entries, ret); + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; + entry_bytes += rb_page_commit(head_page); + } + if (i) + pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); + + /* The last rewound page must be skipped. */ + if (head_page != orig_head) + rb_inc_page(&head_page); + + /* + * If the ring buffer was rewound, then inject the reader page + * into the location just before the original head page. + */ + if (head_page != orig_head) { + struct buffer_page *bpage = orig_head; + + rb_dec_page(&bpage); + /* + * Insert the reader_page before the original head page. + * Since the list encode RB_PAGE flags, general list + * operations should be avoided. + */ + cpu_buffer->reader_page->list.next = &orig_head->list; + cpu_buffer->reader_page->list.prev = orig_head->list.prev; + orig_head->list.prev = &cpu_buffer->reader_page->list; + bpage->list.next = &cpu_buffer->reader_page->list; + + /* Make the head_page the reader page */ + cpu_buffer->reader_page = head_page; + bpage = head_page; + rb_inc_page(&head_page); + head_page->list.prev = bpage->list.prev; + rb_dec_page(&bpage); + bpage->list.next = &head_page->list; + rb_set_list_to_head(&bpage->list); + cpu_buffer->pages = &head_page->list; + + cpu_buffer->head_page = head_page; + meta->head_buffer = (unsigned long)head_page->page; + + /* Reset all the indexes */ + bpage = cpu_buffer->reader_page; + meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = 0; + + for (i = 1, bpage = head_page; i < meta->nr_subbufs; + i++, rb_inc_page(&bpage)) { + meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = i; + } + + /* We'll restart verifying from orig_head */ + head_page = orig_head; + } + skip_rewind: /* If the commit_buffer is the reader page, update the commit page */ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { cpu_buffer->commit_page = cpu_buffer->reader_page; @@ -4118,7 +4216,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static const char *show_irq_str(int bits) { - const char *type[] = { + static const char * type[] = { ".", // 0 "s", // 1 "h", // 2 @@ -5342,7 +5440,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: @@ -5846,24 +5943,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_prepare - 
Prepare for a non consuming read of the buffer + * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer resizing - * is disabled, and the iterator pointer is returned to the caller. + * This creates an iterator to allow non-consuming iteration through + * the buffer. If the buffer is disabled for writing, it will produce + * the same information each time, but if the buffer is still writing + * then the first hit of a write will cause the iteration to stop. * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_read_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. - * - * This overall must be paired with ring_buffer_read_finish. + * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -5889,51 +5982,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_read_finish. 
- */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index b39f36013ef2..5b4be87ba59d 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -1,19 +1,31 @@ # SPDX-License-Identifier: GPL-2.0-only # -config DA_MON_EVENTS +config RV_MON_EVENTS + bool + +config RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_IMPLICIT - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_ID - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS + bool + +config LTL_MON_EVENTS_ID + select RV_MON_EVENTS + bool + +config RV_LTL_MONITOR bool menuconfig RV bool "Runtime Verification" - depends on TRACING + select TRACING help Enable the kernel runtime verification infrastructure. RV is a lightweight (yet rigorous) method that complements classical @@ -25,15 +37,34 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst +config RV_PER_TASK_MONITORS + int "Maximum number of per-task monitor" + depends on RV + range 1 8 + default 2 + help + This option configures the maximum number of per-task RV monitors that can run + simultaneously. + source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" + source "kernel/trace/rv/monitors/sched/Kconfig" -source "kernel/trace/rv/monitors/tss/Kconfig" source "kernel/trace/rv/monitors/sco/Kconfig" source "kernel/trace/rv/monitors/snroc/Kconfig" source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" -source "kernel/trace/rv/monitors/sncid/Kconfig" +source "kernel/trace/rv/monitors/sts/Kconfig" +source "kernel/trace/rv/monitors/nrp/Kconfig" +source "kernel/trace/rv/monitors/sssw/Kconfig" +source "kernel/trace/rv/monitors/opid/Kconfig" +# Add new sched monitors here + +source "kernel/trace/rv/monitors/rtapp/Kconfig" +source "kernel/trace/rv/monitors/pagefault/Kconfig" +source "kernel/trace/rv/monitors/sleep/Kconfig" +# Add new rtapp monitors here + # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index f9b2cd0483c3..750e4ad6fa0f 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -6,12 +6,17 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o -obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o -obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o +obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o +obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o +obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o +obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o +obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o +obj-$(CONFIG_RV_MON_SSSW) += 
monitors/sssw/sssw.o +obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig index 479f86f52e60..f5ec08f65535 100644 --- a/kernel/trace/rv/monitors/tss/Kconfig +++ b/kernel/trace/rv/monitors/nrp/Kconfig @@ -1,14 +1,16 @@ # SPDX-License-Identifier: GPL-2.0-only # -config RV_MON_TSS +config RV_MON_NRP depends on RV depends on RV_MON_SCHED - default y - select DA_MON_EVENTS_IMPLICIT - bool "tss monitor" + default y if !ARM64 + select DA_MON_EVENTS_ID + bool "nrp monitor" help - Monitor to ensure sched_switch happens only in scheduling context. + Monitor to ensure preemption requires need resched. This monitor is part of the sched monitors collection. + This monitor is unstable on arm64, say N unless you are testing it. + For further information, see: Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c new file mode 100644 index 000000000000..5a83b7171432 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "nrp" + +#include <trace/events/irq.h> +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "nrp.h" + +static struct rv_monitor rv_nrp; +DECLARE_DA_MON_PER_TASK(nrp, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, + int cpu, int tif) +{ + /* + * Although need_resched leads to both the rescheduling and preempt_irq + * states, it is safer to start the monitor always in preempt_irq, + * which may not mirror the system state but makes the monitor simpler, + */ + if 
(tif == TIF_NEED_RESCHED) + da_handle_start_event_nrp(tsk, sched_need_resched_nrp); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + if (preempt) + da_handle_event_nrp(current, schedule_entry_preempt_nrp); + else + da_handle_event_nrp(current, schedule_entry_nrp); +} + +static int enable_nrp(void) +{ + int retval; + + retval = da_monitor_init_nrp(); + if (retval) + return retval; + + rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + attach_vector_irq(); + + return 0; +} + +static void disable_nrp(void) +{ + rv_nrp.enabled = 0; + + rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + detach_vector_irq(); + + da_monitor_destroy_nrp(); +} + +static struct rv_monitor rv_nrp = { + .name = "nrp", + .description = "need resched preempts.", + .enable = enable_nrp, + .disable = disable_nrp, + .reset = da_monitor_reset_all_nrp, + .enabled = 0, +}; + +static int __init register_nrp(void) +{ + return rv_register_monitor(&rv_nrp, &rv_sched); +} + +static void __exit unregister_nrp(void) +{ + rv_unregister_monitor(&rv_nrp); +} + +module_init(register_nrp); +module_exit(unregister_nrp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("nrp: need resched preempts."); diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h new file mode 100644 index 000000000000..c9f12207cbf6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nrp automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_nrp { + preempt_irq_nrp = 0, + any_thread_running_nrp, + nested_preempt_nrp, + rescheduling_nrp, + state_max_nrp +}; + +#define INVALID_STATE state_max_nrp + +enum events_nrp { + irq_entry_nrp = 0, + sched_need_resched_nrp, + schedule_entry_nrp, + schedule_entry_preempt_nrp, + event_max_nrp +}; + +struct automaton_nrp { + char *state_names[state_max_nrp]; + char *event_names[event_max_nrp]; + unsigned char function[state_max_nrp][event_max_nrp]; + unsigned char initial_state; + bool final_states[state_max_nrp]; +}; + +static const struct automaton_nrp automaton_nrp = { + .state_names = { + "preempt_irq", + "any_thread_running", + "nested_preempt", + "rescheduling" + }, + .event_names = { + "irq_entry", + "sched_need_resched", + "schedule_entry", + "schedule_entry_preempt" + }, + .function = { + { + preempt_irq_nrp, + preempt_irq_nrp, + nested_preempt_nrp, + nested_preempt_nrp + }, + { + any_thread_running_nrp, + rescheduling_nrp, + any_thread_running_nrp, + INVALID_STATE + }, + { + nested_preempt_nrp, + preempt_irq_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + { + preempt_irq_nrp, + rescheduling_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + }, + .initial_state = preempt_irq_nrp, + .final_states = { 0, 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h new file mode 100644 index 000000000000..2e13497de3b6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h @@ -0,0 +1,15 
@@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NRP +DEFINE_EVENT(event_da_monitor_id, event_nrp, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nrp, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_NRP */ diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig new file mode 100644 index 000000000000..561d32da572b --- /dev/null +++ b/kernel/trace/rv/monitors/opid/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_OPID + depends on RV + depends on TRACE_IRQFLAGS + depends on TRACE_PREEMPT_TOGGLE + depends on RV_MON_SCHED + default y if PREEMPT_RT + select DA_MON_EVENTS_IMPLICIT + bool "opid monitor" + help + Monitor to ensure operations like wakeup and need resched occur with + interrupts and preemption disabled or during IRQs, where preemption + may not be disabled explicitly. + + This monitor is unstable on !PREEMPT_RT, say N unless you are testing it. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c new file mode 100644 index 000000000000..50d64e7fb8c4 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "opid" + +#include <trace/events/sched.h> +#include <trace/events/irq.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "opid.h" + +static struct rv_monitor rv_opid; +DECLARE_DA_MON_PER_CPU(opid, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_opid(irq_entry_opid); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(irq_disable_opid); +} + +static 
void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(irq_enable_opid); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_opid(irq_entry_opid); +} + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(preempt_disable_opid); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(preempt_enable_opid); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) +{ + /* The monitor's initial state is not in_irq */ + if (this_cpu_read(hardirq_context)) + da_handle_event_opid(sched_need_resched_opid); + else + da_handle_start_event_opid(sched_need_resched_opid); +} + +static void handle_sched_waking(void *data, struct task_struct *p) +{ + /* The monitor's initial state is not in_irq */ + if (this_cpu_read(hardirq_context)) + da_handle_event_opid(sched_waking_opid); + else + da_handle_start_event_opid(sched_waking_opid); +} + +static int enable_opid(void) +{ + int retval; + + retval = da_monitor_init_opid(); + if (retval) + return retval; + + rv_attach_trace_probe("opid", irq_disable, handle_irq_disable); + rv_attach_trace_probe("opid", irq_enable, handle_irq_enable); + rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("opid", sched_waking, handle_sched_waking); + attach_vector_irq(); + + return 0; +} + +static void disable_opid(void) +{ + rv_opid.enabled = 0; + + rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); + rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); + detach_vector_irq(); + + da_monitor_destroy_opid(); +} + +/* + * This is the monitor register section. 
+ */ +static struct rv_monitor rv_opid = { + .name = "opid", + .description = "operations with preemption and irq disabled.", + .enable = enable_opid, + .disable = disable_opid, + .reset = da_monitor_reset_all_opid, + .enabled = 0, +}; + +static int __init register_opid(void) +{ + return rv_register_monitor(&rv_opid, &rv_sched); +} + +static void __exit unregister_opid(void) +{ + rv_unregister_monitor(&rv_opid); +} + +module_init(register_opid); +module_exit(unregister_opid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("opid: operations with preemption and irq disabled."); diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h new file mode 100644 index 000000000000..b4b8c2ff7f64 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of opid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_opid { + disabled_opid = 0, + enabled_opid, + in_irq_opid, + irq_disabled_opid, + preempt_disabled_opid, + state_max_opid +}; + +#define INVALID_STATE state_max_opid + +enum events_opid { + irq_disable_opid = 0, + irq_enable_opid, + irq_entry_opid, + preempt_disable_opid, + preempt_enable_opid, + sched_need_resched_opid, + sched_waking_opid, + event_max_opid +}; + +struct automaton_opid { + char *state_names[state_max_opid]; + char *event_names[event_max_opid]; + unsigned char function[state_max_opid][event_max_opid]; + unsigned char initial_state; + bool final_states[state_max_opid]; +}; + +static const struct automaton_opid automaton_opid = { + .state_names = { + "disabled", + "enabled", + "in_irq", + "irq_disabled", + "preempt_disabled" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "preempt_disable", + "preempt_enable", + "sched_need_resched", + "sched_waking" + }, + .function = { + { + INVALID_STATE, + preempt_disabled_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + disabled_opid, + disabled_opid + }, + { + irq_disabled_opid, + INVALID_STATE, + INVALID_STATE, + preempt_disabled_opid, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + INVALID_STATE, + INVALID_STATE, + in_irq_opid, + in_irq_opid + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + INVALID_STATE + }, + { + disabled_opid, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = disabled_opid, + .final_states = { 0, 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h index 3ce42a57671d..3df6ff955c30 100644 --- a/kernel/trace/rv/monitors/sncid/sncid_trace.h +++ b/kernel/trace/rv/monitors/opid/opid_trace.h @@ -4,12 +4,12 @@ * Snippet to be included in rv_trace.h */ -#ifdef CONFIG_RV_MON_SNCID -DEFINE_EVENT(event_da_monitor, event_sncid, +#ifdef CONFIG_RV_MON_OPID +DEFINE_EVENT(event_da_monitor, event_opid, TP_PROTO(char *state, char *event, char *next_state, bool final_state), TP_ARGS(state, event, next_state, final_state)); -DEFINE_EVENT(error_da_monitor, error_sncid, +DEFINE_EVENT(error_da_monitor, error_opid, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_SNCID */ +#endif /* CONFIG_RV_MON_OPID */ diff 
--git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig new file mode 100644 index 000000000000..5e16625f1653 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_PAGEFAULT + depends on RV + select RV_LTL_MONITOR + depends on RV_MON_RTAPP + depends on X86 || RISCV + default y + select LTL_MON_EVENTS_ID + bool "pagefault monitor" + help + Monitor that real-time tasks do not raise page faults, causing + undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + This monitor does not affect execution speed while it is not running, + therefore it is safe to enable this in production kernel. diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c new file mode 100644 index 000000000000..9fe6123b2200 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <linux/tracepoint.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "pagefault" + +#include <rv_trace.h> +#include <trace/events/exceptions.h> +#include <monitors/rtapp/rtapp.h> + +#include "pagefault.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. 
+ */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + if (task_creation) + ltl_atom_set(mon, LTL_PAGEFAULT, false); +} + +static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + ltl_atom_pulse(current, LTL_PAGEFAULT, true); +} + +static int enable_pagefault(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + return 0; +} + +static void disable_pagefault(void) +{ + rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_pagefault = { + .name = "pagefault", + .description = "Monitor that RT tasks do not raise page faults", + .enable = enable_pagefault, + .disable = disable_pagefault, +}; + +static int __init register_pagefault(void) +{ + return rv_register_monitor(&rv_pagefault, &rv_rtapp); +} + +static void __exit unregister_pagefault(void) +{ + rv_unregister_monitor(&rv_pagefault); +} + +module_init(register_pagefault); +module_exit(unregister_pagefault); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults"); diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h new file mode 100644 index 000000000000..c580ec194009 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. 
+ * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME pagefault + +enum ltl_atom { + LTL_PAGEFAULT, + LTL_RT, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "pa", + "rt", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + if (val4) + __set_bit(S0, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + switch (state) { + case S0: + if (val4) + __set_bit(S0, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h new file mode 100644 index 000000000000..fe1f82597b1a --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_PAGEFAULT +DEFINE_EVENT(event_ltl_monitor_id, event_pagefault, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_pagefault, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_PAGEFAULT */ diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig new file mode 100644 index 000000000000..1ce9370a9ba8 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/Kconfig @@ -0,0 +1,11 @@ +config RV_MON_RTAPP + depends on RV + depends on RV_PER_TASK_MONITORS >= 2 + bool "rtapp monitor" + help + Collection of monitors to check for common problems with real-time + application that may cause unexpected latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. 
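The pagefault monitor above reduces to a single-state Buchi automaton: the property remains satisfiable as long as a task is either not real-time or has not faulted. The following is a minimal user-space sketch of how such a generated table is commonly consumed; it is an illustration only, it assumes the usual convention that an empty set of successor states signals a violation, and none of its names or types come from the kernel sources.

/*
 * Stand-alone illustration (not kernel code) of the pagefault Buchi table.
 * Assumption: an empty successor set after an atom changes means the LTL
 * property was violated, which the in-kernel monitor would report through
 * its error tracepoint.
 */
#include <stdbool.h>
#include <stdio.h>

enum { S0, NUM_STATES };

struct mon {
	bool rt;		/* mirrors LTL_RT */
	bool pagefault;		/* mirrors LTL_PAGEFAULT */
	unsigned long states;	/* bitmap of currently possible Buchi states */
};

/* Same guard as in pagefault.h: S0 survives while (!rt || !pagefault) */
static unsigned long possible_next_states(const struct mon *m)
{
	unsigned long next = 0;

	if ((m->states & (1UL << S0)) && (!m->rt || !m->pagefault))
		next |= 1UL << S0;
	return next;
}

static void step(struct mon *m, const char *event)
{
	m->states = possible_next_states(m);
	printf("%-16s -> %s\n", event, m->states ? "ok" : "VIOLATION");
}

int main(void)
{
	/* an RT task that has not faulted yet starts with S0 possible */
	struct mon m = { .rt = true, .pagefault = false, .states = 1UL << S0 };

	step(&m, "sched_switch");	/* property still satisfiable */

	m.pagefault = true;		/* a page fault pulses the atom */
	step(&m, "page_fault_user");	/* no successor state remains */

	return 0;
}

Running the sketch prints one "ok" line followed by the violation, which is the same shape of outcome the kernel monitor would surface via its event/error tracepoints for the pagefault monitor.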
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c new file mode 100644 index 000000000000..fd75fc927d65 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "rtapp" + +#include "rtapp.h" + +struct rv_monitor rv_rtapp; + +struct rv_monitor rv_rtapp = { + .name = "rtapp", + .description = "Collection of monitors for detecting problems with real-time applications", +}; + +static int __init register_rtapp(void) +{ + return rv_register_monitor(&rv_rtapp, NULL); +} + +static void __exit unregister_rtapp(void) +{ + rv_unregister_monitor(&rv_rtapp); +} + +module_init(register_rtapp); +module_exit(unregister_rtapp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications"); diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h new file mode 100644 index 000000000000..4c200d67c7f6 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_rtapp; diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig index ae3eb410abd7..aa16456da864 100644 --- a/kernel/trace/rv/monitors/sched/Kconfig +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -2,6 +2,7 @@ # config RV_MON_SCHED depends on RV + depends on RV_PER_TASK_MONITORS >= 3 bool "sched monitor" help Collection of monitors to check the scheduler behaves according to specifications. diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c index 905e03c3c934..d04db4b543f9 100644 --- a/kernel/trace/rv/monitors/sched/sched.c +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -21,8 +21,7 @@ struct rv_monitor rv_sched = { static int __init register_sched(void) { - rv_register_monitor(&rv_sched, NULL); - return 0; + return rv_register_monitor(&rv_sched, NULL); } static void __exit unregister_sched(void) diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c index 4cff59220bfc..04c36405e2e3 100644 --- a/kernel/trace/rv/monitors/sco/sco.c +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -24,12 +24,12 @@ static void handle_sched_set_state(void *data, struct task_struct *tsk, int stat da_handle_start_event_sco(sched_set_state_sco); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_sco(schedule_entry_sco); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_sco(schedule_exit_sco); } @@ -71,8 +71,7 @@ static struct rv_monitor rv_sco = { static int __init register_sco(void) { - rv_register_monitor(&rv_sco, &rv_sched); - return 0; + return rv_register_monitor(&rv_sco, &rv_sched); } static void __exit unregister_sco(void) diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig index b9114fbf680f..682d0416188b 100644 --- a/kernel/trace/rv/monitors/scpd/Kconfig +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SCPD depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select 
DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c index cbdd6a5f8d7f..1e351ba52fee 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.c +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_scpd(preempt_enable_scpd); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_scpd(schedule_entry_scpd); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_event_scpd(schedule_exit_scpd); } @@ -79,8 +79,7 @@ static struct rv_monitor rv_scpd = { static int __init register_scpd(void) { - rv_register_monitor(&rv_scpd, &rv_sched); - return 0; + return rv_register_monitor(&rv_scpd, &rv_sched); } static void __exit unregister_scpd(void) diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig new file mode 100644 index 000000000000..6b7a122e7b47 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/Kconfig @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SLEEP + depends on RV + select RV_LTL_MONITOR + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_RTAPP + select TRACE_IRQFLAGS + default y + select LTL_MON_EVENTS_ID + bool "sleep monitor" + help + Monitor that real-time tasks do not sleep in a manner that may + cause undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + Enabling this monitor may have performance impact (due to select + TRACE_IRQFLAGS). Therefore, you probably should say N for + production kernel. diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c new file mode 100644 index 000000000000..eea447b06907 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "sleep" + +#include <trace/events/syscalls.h> +#include <trace/events/sched.h> +#include <trace/events/lock.h> +#include <uapi/linux/futex.h> +#include <rv_trace.h> +#include <monitors/rtapp/rtapp.h> + +#include "sleep.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. 
+ */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + ltl_atom_set(mon, LTL_SLEEP, false); + ltl_atom_set(mon, LTL_WAKE, false); + ltl_atom_set(mon, LTL_ABORT_SLEEP, false); + ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false); + ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false); + ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false); + + if (task_creation) { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false); + } + + if (task->flags & PF_KTHREAD) { + ltl_atom_set(mon, LTL_KERNEL_THREAD, true); + + /* kernel tasks do not do syscall */ + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + + if (strstarts(task->comm, "migration/")) + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true); + else + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + + if (strstarts(task->comm, "rcu")) + ltl_atom_set(mon, LTL_TASK_IS_RCU, true); + else + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + } else { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_KERNEL_THREAD, false); + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + } + +} + +static void handle_sched_set_state(void *data, struct task_struct *task, int state) +{ + if (state & TASK_INTERRUPTIBLE) + ltl_atom_pulse(task, LTL_SLEEP, true); + else if (state == TASK_RUNNING) + ltl_atom_pulse(task, LTL_ABORT_SLEEP, true); +} + +static void handle_sched_wakeup(void *data, struct task_struct *task) +{ + ltl_atom_pulse(task, LTL_WAKE, true); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + if (this_cpu_read(hardirq_context)) { + ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true); + } else if (in_task()) { + if (current->prio <= task->prio) + ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true); + } else if (in_nmi()) { + ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true); + } +} + +static void handle_contention_begin(void *data, void *lock, unsigned int flags) +{ + if (flags & LCB_F_RT) + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true); +} + +static void handle_contention_end(void *data, void *lock, int ret) +{ + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false); +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct ltl_monitor *mon; + unsigned long args[6]; + int op, cmd; + + mon = ltl_get_monitor(current); + + switch (id) { + case __NR_clock_nanosleep: +#ifdef __NR_clock_nanosleep_time64 + case __NR_clock_nanosleep_time64: +#endif + syscall_get_arguments(current, regs, args); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); + break; + + case __NR_futex: +#ifdef __NR_futex_time64 
+ case __NR_futex_time64: +#endif + syscall_get_arguments(current, regs, args); + op = args[1]; + cmd = op & FUTEX_CMD_MASK; + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: + ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true); + break; + case FUTEX_WAIT: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + ltl_atom_update(current, LTL_FUTEX_WAIT, true); + break; + } + break; + } +} + +static void handle_sys_exit(void *data, struct pt_regs *regs, long ret) +{ + struct ltl_monitor *mon = ltl_get_monitor(current); + + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false); +} + +static void handle_kthread_stop(void *data, struct task_struct *task) +{ + /* FIXME: this could race with other tracepoint handlers */ + ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true); +} + +static int enable_sleep(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + return 0; +} + +static void disable_sleep(void) +{ + rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_sleep = { + .name = "sleep", + .description = "Monitor that RT tasks do not undesirably sleep", + .enable = enable_sleep, + .disable = disable_sleep, +}; + +static int __init register_sleep(void) +{ + return rv_register_monitor(&rv_sleep, &rv_rtapp); +} + +static void __exit unregister_sleep(void) +{ + rv_unregister_monitor(&rv_sleep); +} + +module_init(register_sleep); +module_exit(unregister_sleep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep"); diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h new file mode 100644 index 000000000000..2ab46fd218d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. 
+ * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME sleep + +enum ltl_atom { + LTL_ABORT_SLEEP, + LTL_BLOCK_ON_RT_MUTEX, + LTL_CLOCK_NANOSLEEP, + LTL_FUTEX_LOCK_PI, + LTL_FUTEX_WAIT, + LTL_KERNEL_THREAD, + LTL_KTHREAD_SHOULD_STOP, + LTL_NANOSLEEP_CLOCK_MONOTONIC, + LTL_NANOSLEEP_CLOCK_TAI, + LTL_NANOSLEEP_TIMER_ABSTIME, + LTL_RT, + LTL_SLEEP, + LTL_TASK_IS_MIGRATION, + LTL_TASK_IS_RCU, + LTL_WAKE, + LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + LTL_WOKEN_BY_HARDIRQ, + LTL_WOKEN_BY_NMI, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "ab_sl", + "bl_on_rt_mu", + "cl_na", + "fu_lo_pi", + "fu_wa", + "ker_th", + "kth_sh_st", + "na_cl_mo", + "na_cl_ta", + "na_ti_ab", + "rt", + "sl", + "ta_mi", + "ta_rc", + "wak", + "wo_eq_hi_pr", + "wo_ha", + "wo_nm", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + if (val3) + __set_bit(S0, mon->states); + if (val11 && val13) + __set_bit(S1, mon->states); + if (val11 && val14) + __set_bit(S4, mon->states); + if (val5) + __set_bit(S5, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || 
task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + switch (state) { + case S0: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S1: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S2: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S3: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S4: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S5: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S6: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S7: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h 
b/kernel/trace/rv/monitors/sleep/sleep_trace.h new file mode 100644 index 000000000000..22eaf31da987 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SLEEP +DEFINE_EVENT(event_ltl_monitor_id, event_sleep, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_sleep, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_SLEEP */ diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c deleted file mode 100644 index f5037cd6214c..000000000000 --- a/kernel/trace/rv/monitors/sncid/sncid.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/ftrace.h> -#include <linux/tracepoint.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/rv.h> -#include <rv/instrumentation.h> -#include <rv/da_monitor.h> - -#define MODULE_NAME "sncid" - -#include <trace/events/sched.h> -#include <trace/events/preemptirq.h> -#include <rv_trace.h> -#include <monitors/sched/sched.h> - -#include "sncid.h" - -static struct rv_monitor rv_sncid; -DECLARE_DA_MON_PER_CPU(sncid, unsigned char); - -static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event_sncid(irq_disable_sncid); -} - -static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_start_event_sncid(irq_enable_sncid); -} - -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) -{ - da_handle_start_event_sncid(schedule_entry_sncid); -} - -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) -{ - da_handle_start_event_sncid(schedule_exit_sncid); -} - -static int enable_sncid(void) -{ - int retval; - - retval = da_monitor_init_sncid(); - if (retval) - return retval; - - rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_sncid(void) -{ - rv_sncid.enabled = 0; - - rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_sncid(); -} - -static struct rv_monitor rv_sncid = { - .name = "sncid", - .description = "schedule not called with interrupt disabled.", - .enable = enable_sncid, - .disable = disable_sncid, - .reset = da_monitor_reset_all_sncid, - .enabled = 0, -}; - -static int __init register_sncid(void) -{ - rv_register_monitor(&rv_sncid, &rv_sched); - return 0; -} - -static void __exit unregister_sncid(void) -{ - rv_unregister_monitor(&rv_sncid); -} - -module_init(register_sncid); -module_exit(unregister_sncid); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); -MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h deleted file mode 100644 index 21304725142b..000000000000 --- 
a/kernel/trace/rv/monitors/sncid/sncid.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of sncid automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_sncid { - can_sched_sncid = 0, - cant_sched_sncid, - state_max_sncid -}; - -#define INVALID_STATE state_max_sncid - -enum events_sncid { - irq_disable_sncid = 0, - irq_enable_sncid, - schedule_entry_sncid, - schedule_exit_sncid, - event_max_sncid -}; - -struct automaton_sncid { - char *state_names[state_max_sncid]; - char *event_names[event_max_sncid]; - unsigned char function[state_max_sncid][event_max_sncid]; - unsigned char initial_state; - bool final_states[state_max_sncid]; -}; - -static const struct automaton_sncid automaton_sncid = { - .state_names = { - "can_sched", - "cant_sched" - }, - .event_names = { - "irq_disable", - "irq_enable", - "schedule_entry", - "schedule_exit" - }, - .function = { - { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, - { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, - }, - .initial_state = can_sched_sncid, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig index 77527f971232..7dd54f434ff7 100644 --- a/kernel/trace/rv/monitors/snep/Kconfig +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SNEP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c index 0076ba6d7ea4..558950f524a5 100644 --- a/kernel/trace/rv/monitors/snep/snep.c +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_snep(preempt_enable_snep); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_snep(schedule_entry_snep); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_snep(schedule_exit_snep); } @@ -79,8 +79,7 @@ static struct rv_monitor rv_snep = { static int __init register_snep(void) { - rv_register_monitor(&rv_snep, &rv_sched); - return 0; + return rv_register_monitor(&rv_snep, &rv_sched); } static void __exit unregister_snep(void) diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h index 6d16b9ad931e..4cd9abb77b7b 100644 --- a/kernel/trace/rv/monitors/snep/snep.h +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -41,8 +41,18 @@ static const struct automaton_snep automaton_snep = { "schedule_exit" }, .function = { - { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, - { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + { + non_scheduling_context_snep, + non_scheduling_context_snep, + scheduling_contex_snep, + INVALID_STATE + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + non_scheduling_context_snep + }, }, .initial_state = non_scheduling_context_snep, .final_states = { 1, 0 }, diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c index bb1f60d55296..540e686e699f 
100644 --- a/kernel/trace/rv/monitors/snroc/snroc.c +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -68,8 +68,7 @@ static struct rv_monitor rv_snroc = { static int __init register_snroc(void) { - rv_register_monitor(&rv_snroc, &rv_sched); - return 0; + return rv_register_monitor(&rv_snroc, &rv_sched); } static void __exit unregister_snroc(void) diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig index 76bcfef4fd10..23b7eeb38bbf 100644 --- a/kernel/trace/rv/monitors/sncid/Kconfig +++ b/kernel/trace/rv/monitors/sssw/Kconfig @@ -1,14 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-only # -config RV_MON_SNCID +config RV_MON_SSSW depends on RV - depends on IRQSOFF_TRACER depends on RV_MON_SCHED default y - select DA_MON_EVENTS_IMPLICIT - bool "sncid monitor" + select DA_MON_EVENTS_ID + bool "sssw monitor" help - Monitor to ensure schedule is not called with interrupt disabled. + Monitor to ensure sched_set_state to sleepable leads to sleeping and + sleeping tasks require wakeup. This monitor is part of the sched monitors collection. For further information, see: diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c new file mode 100644 index 000000000000..84b8d890d9d4 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sssw" + +#include <trace/events/sched.h> +#include <trace/events/signal.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sssw.h" + +static struct rv_monitor rv_sssw; +DECLARE_DA_MON_PER_TASK(sssw, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + if (state == TASK_RUNNING) + da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw); + else + da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (preempt) + da_handle_event_sssw(prev, sched_switch_preempt_sssw); + else if (prev_state == TASK_RUNNING) + da_handle_event_sssw(prev, sched_switch_yield_sssw); + else if (prev_state == TASK_RTLOCK_WAIT) + /* special case of sleeping task with racy conditions */ + da_handle_event_sssw(prev, sched_switch_blocking_sssw); + else + da_handle_event_sssw(prev, sched_switch_suspend_sssw); + da_handle_event_sssw(next, sched_switch_in_sssw); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + /* + * Wakeup can also lead to signal_wakeup although the system is + * actually runnable. The monitor can safely start with this event. 
+ */ + da_handle_start_event_sssw(p, sched_wakeup_sssw); +} + +static void handle_signal_deliver(void *data, int sig, + struct kernel_siginfo *info, + struct k_sigaction *ka) +{ + da_handle_event_sssw(current, signal_deliver_sssw); +} + +static int enable_sssw(void) +{ + int retval; + + retval = da_monitor_init_sssw(); + if (retval) + return retval; + + rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + return 0; +} + +static void disable_sssw(void) +{ + rv_sssw.enabled = 0; + + rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + da_monitor_destroy_sssw(); +} + +static struct rv_monitor rv_sssw = { + .name = "sssw", + .description = "set state sleep and wakeup.", + .enable = enable_sssw, + .disable = disable_sssw, + .reset = da_monitor_reset_all_sssw, + .enabled = 0, +}; + +static int __init register_sssw(void) +{ + return rv_register_monitor(&rv_sssw, &rv_sched); +} + +static void __exit unregister_sssw(void) +{ + rv_unregister_monitor(&rv_sssw); +} + +module_init(register_sssw); +module_exit(unregister_sssw); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sssw: set state sleep and wakeup."); diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h new file mode 100644 index 000000000000..243d54050c94 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sssw automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sssw { + runnable_sssw = 0, + signal_wakeup_sssw, + sleepable_sssw, + sleeping_sssw, + state_max_sssw +}; + +#define INVALID_STATE state_max_sssw + +enum events_sssw { + sched_set_state_runnable_sssw = 0, + sched_set_state_sleepable_sssw, + sched_switch_blocking_sssw, + sched_switch_in_sssw, + sched_switch_preempt_sssw, + sched_switch_suspend_sssw, + sched_switch_yield_sssw, + sched_wakeup_sssw, + signal_deliver_sssw, + event_max_sssw +}; + +struct automaton_sssw { + char *state_names[state_max_sssw]; + char *event_names[event_max_sssw]; + unsigned char function[state_max_sssw][event_max_sssw]; + unsigned char initial_state; + bool final_states[state_max_sssw]; +}; + +static const struct automaton_sssw automaton_sssw = { + .state_names = { + "runnable", + "signal_wakeup", + "sleepable", + "sleeping" + }, + .event_names = { + "sched_set_state_runnable", + "sched_set_state_sleepable", + "sched_switch_blocking", + "sched_switch_in", + "sched_switch_preempt", + "sched_switch_suspend", + "sched_switch_yield", + "sched_wakeup", + "signal_deliver" + }, + .function = { + { + runnable_sssw, + sleepable_sssw, + sleeping_sssw, + runnable_sssw, + runnable_sssw, + INVALID_STATE, + runnable_sssw, + runnable_sssw, + runnable_sssw + }, + { + INVALID_STATE, + sleepable_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + runnable_sssw + }, + { + runnable_sssw, + 
sleepable_sssw, + sleeping_sssw, + sleepable_sssw, + sleepable_sssw, + sleeping_sssw, + signal_wakeup_sssw, + runnable_sssw, + sleepable_sssw + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + runnable_sssw, + INVALID_STATE + }, + }, + .initial_state = runnable_sssw, + .final_states = { 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h new file mode 100644 index 000000000000..6c03cfc6960b --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SSSW +DEFINE_EVENT(event_da_monitor_id, event_sssw, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_sssw, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SSSW */ diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig new file mode 100644 index 000000000000..7d1ff0f6fc91 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STS + depends on RV + depends on TRACE_IRQFLAGS + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sts monitor" + help + Monitor to ensure relationships between scheduler and task switches + * the scheduler is called and returns with interrupts disabled + * each call to the scheduler has up to one switch + * switches only happen inside the scheduler + * each call to the scheduler disables interrupts to switch + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c new file mode 100644 index 000000000000..c4a9cd67c1d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sts" + +#include <trace/events/sched.h> +#include <trace/events/irq.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sts.h" + +static struct rv_monitor rv_sts; +DECLARE_DA_MON_PER_CPU(sts, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_disable_sts); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_enable_sts); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_sts(sched_switch_sts); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event_sts(schedule_entry_sts); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_start_event_sts(schedule_exit_sts); +} + +static int enable_sts(void) +{ + int retval; + + retval = da_monitor_init_sts(); + if (retval) + return retval; + + rv_attach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + attach_vector_irq(); + 
+ return 0; +} + +static void disable_sts(void) +{ + rv_sts.enabled = 0; + + rv_detach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + detach_vector_irq(); + + da_monitor_destroy_sts(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_sts = { + .name = "sts", + .description = "schedule implies task switch.", + .enable = enable_sts, + .disable = disable_sts, + .reset = da_monitor_reset_all_sts, + .enabled = 0, +}; + +static int __init register_sts(void) +{ + return rv_register_monitor(&rv_sts, &rv_sched); +} + +static void __exit unregister_sts(void) +{ + rv_unregister_monitor(&rv_sts); +} + +module_init(register_sts); +module_exit(unregister_sts); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sts: schedule implies task switch."); diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h new file mode 100644 index 000000000000..3368b6599a00 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sts automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sts { + can_sched_sts = 0, + cant_sched_sts, + disable_to_switch_sts, + enable_to_exit_sts, + in_irq_sts, + scheduling_sts, + switching_sts, + state_max_sts +}; + +#define INVALID_STATE state_max_sts + +enum events_sts { + irq_disable_sts = 0, + irq_enable_sts, + irq_entry_sts, + sched_switch_sts, + schedule_entry_sts, + schedule_exit_sts, + event_max_sts +}; + +struct automaton_sts { + char *state_names[state_max_sts]; + char *event_names[event_max_sts]; + unsigned char function[state_max_sts][event_max_sts]; + unsigned char initial_state; + bool final_states[state_max_sts]; +}; + +static const struct automaton_sts automaton_sts = { + .state_names = { + "can_sched", + "cant_sched", + "disable_to_switch", + "enable_to_exit", + "in_irq", + "scheduling", + "switching" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + scheduling_sts, + INVALID_STATE + }, + { + INVALID_STATE, + can_sched_sts, + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + in_irq_sts, + switching_sts, + INVALID_STATE, + INVALID_STATE + }, + { + enable_to_exit_sts, + enable_to_exit_sts, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + can_sched_sts + }, + { + INVALID_STATE, + scheduling_sts, + in_irq_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + disable_to_switch_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = can_sched_sts, + .final_states = { 1, 0, 0, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h 
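/*
 * A minimal sketch (not part of this patch) of how a deterministic-automaton
 * monitor consumes the transition table above: the (current state, event)
 * pair indexes .function[][], and state_max_sts doubles as the INVALID_STATE
 * sentinel for events that are not expected in the current state.
 */
static inline unsigned char sts_next_state(unsigned char curr, enum events_sts event)
{
	if (curr >= state_max_sts || event >= event_max_sts)
		return INVALID_STATE;

	/* INVALID_STATE here means the monitor saw an unexpected event */
	return automaton_sts.function[curr][event];
}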
index 4619dbb50cc0..d78beb58d5b3 100644 --- a/kernel/trace/rv/monitors/tss/tss_trace.h +++ b/kernel/trace/rv/monitors/sts/sts_trace.h @@ -4,12 +4,12 @@ * Snippet to be included in rv_trace.h */ -#ifdef CONFIG_RV_MON_TSS -DEFINE_EVENT(event_da_monitor, event_tss, +#ifdef CONFIG_RV_MON_STS +DEFINE_EVENT(event_da_monitor, event_sts, TP_PROTO(char *state, char *event, char *next_state, bool final_state), TP_ARGS(state, event, next_state, final_state)); -DEFINE_EVENT(error_da_monitor, error_tss, +DEFINE_EVENT(error_da_monitor, error_sts, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_TSS */ +#endif /* CONFIG_RV_MON_STS */ diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c deleted file mode 100644 index 542787e6524f..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/ftrace.h> -#include <linux/tracepoint.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/rv.h> -#include <rv/instrumentation.h> -#include <rv/da_monitor.h> - -#define MODULE_NAME "tss" - -#include <trace/events/sched.h> -#include <rv_trace.h> -#include <monitors/sched/sched.h> - -#include "tss.h" - -static struct rv_monitor rv_tss; -DECLARE_DA_MON_PER_CPU(tss, unsigned char); - -static void handle_sched_switch(void *data, bool preempt, - struct task_struct *prev, - struct task_struct *next, - unsigned int prev_state) -{ - da_handle_event_tss(sched_switch_tss); -} - -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) -{ - da_handle_event_tss(schedule_entry_tss); -} - -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) -{ - da_handle_start_event_tss(schedule_exit_tss); -} - -static int enable_tss(void) -{ - int retval; - - retval = da_monitor_init_tss(); - if (retval) - return retval; - - rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_tss(void) -{ - rv_tss.enabled = 0; - - rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_tss(); -} - -static struct rv_monitor rv_tss = { - .name = "tss", - .description = "task switch while scheduling.", - .enable = enable_tss, - .disable = disable_tss, - .reset = da_monitor_reset_all_tss, - .enabled = 0, -}; - -static int __init register_tss(void) -{ - rv_register_monitor(&rv_tss, &rv_sched); - return 0; -} - -static void __exit unregister_tss(void) -{ - rv_unregister_monitor(&rv_tss); -} - -module_init(register_tss); -module_exit(unregister_tss); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); -MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h deleted file mode 100644 index f0a36fda1b87..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of tss automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_tss { - thread_tss = 0, - 
sched_tss, - state_max_tss -}; - -#define INVALID_STATE state_max_tss - -enum events_tss { - sched_switch_tss = 0, - schedule_entry_tss, - schedule_exit_tss, - event_max_tss -}; - -struct automaton_tss { - char *state_names[state_max_tss]; - char *event_names[event_max_tss]; - unsigned char function[state_max_tss][event_max_tss]; - unsigned char initial_state; - bool final_states[state_max_tss]; -}; - -static const struct automaton_tss automaton_tss = { - .state_names = { - "thread", - "sched" - }, - .event_names = { - "sched_switch", - "schedule_entry", - "schedule_exit" - }, - .function = { - { INVALID_STATE, sched_tss, INVALID_STATE }, - { sched_tss, INVALID_STATE, thread_tss }, - }, - .initial_state = thread_tss, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index e464b9294865..87a26195792b 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_WIP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE select DA_MON_EVENTS_IMPLICIT bool "wip monitor" help diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index ed758fec8608..4b4e99615a11 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,8 +71,7 @@ static struct rv_monitor rv_wip = { static int __init register_wip(void) { - rv_register_monitor(&rv_wip, NULL); - return 0; + return rv_register_monitor(&rv_wip, NULL); } static void __exit unregister_wip(void) diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 172f31c4b0f3..4145bea2729e 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,8 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr, NULL); - return 0; + return rv_register_monitor(&rv_wwnr, NULL); } static void __exit unregister_wwnr(void) diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 0186ff4cbd0b..74c6bcc2c749 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,9 +13,13 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_panic_reaction(char *msg) +__printf(1, 2) static void rv_panic_reaction(const char *msg, ...) { - panic(msg); + va_list args; + + va_start(args, msg); + vpanic(msg, args); + va_end(args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 178759dbf89f..2dae2916c05f 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,9 +12,13 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_printk_reaction(char *msg) +__printf(1, 2) static void rv_printk_reaction(const char *msg, ...) 
{ - printk_deferred(msg); + va_list args; + + va_start(args, msg); + vprintk_deferred(msg, args); + va_end(args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index e4077500a91d..bd7d56dbf6c2 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -143,7 +143,7 @@ #include <linux/init.h> #include <linux/slab.h> -#ifdef CONFIG_DA_MON_EVENTS +#ifdef CONFIG_RV_MON_EVENTS #define CREATE_TRACE_POINTS #include <rv_trace.h> #endif @@ -165,7 +165,7 @@ struct dentry *get_monitors_root(void) LIST_HEAD(rv_monitors_list); static int task_monitor_count; -static bool task_monitor_slots[RV_PER_TASK_MONITORS]; +static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS]; int rv_get_task_monitor_slot(void) { @@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void) lockdep_assert_held(&rv_interface_lock); - if (task_monitor_count == RV_PER_TASK_MONITORS) + if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS) return -EBUSY; task_monitor_count++; - for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) { if (task_monitor_slots[i] == false) { task_monitor_slots[i] = true; return i; @@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot) { lockdep_assert_held(&rv_interface_lock); - if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) { WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); return; } @@ -210,9 +210,9 @@ void rv_put_task_monitor_slot(int slot) * Monitors with a parent are nested, * Monitors without a parent could be standalone or containers. */ -bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +bool rv_is_nested_monitor(struct rv_monitor *mon) { - return mdef->parent != NULL; + return mon->parent != NULL; } /* @@ -223,16 +223,16 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef) * for enable()/disable(). Use this condition to find empty containers. * Keep both conditions in case we have some non-compliant containers. */ -bool rv_is_container_monitor(struct rv_monitor_def *mdef) +bool rv_is_container_monitor(struct rv_monitor *mon) { - struct rv_monitor_def *next; + struct rv_monitor *next; - if (list_is_last(&mdef->list, &rv_monitors_list)) + if (list_is_last(&mon->list, &rv_monitors_list)) return false; - next = list_next_entry(mdef, list); + next = list_next_entry(mon, list); - return next->parent == mdef->monitor || !mdef->monitor->enable; + return next->parent == mon || !mon->enable; } /* @@ -241,10 +241,10 @@ bool rv_is_container_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; const char *buff; - buff = mdef->monitor->enabled ? "1\n" : "0\n"; + buff = mon->enabled ? 
"1\n" : "0\n"; return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); } @@ -252,14 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf /* * __rv_disable_monitor - disabled an enabled monitor */ -static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +static int __rv_disable_monitor(struct rv_monitor *mon, bool sync) { lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) { - mdef->monitor->enabled = 0; - if (mdef->monitor->disable) - mdef->monitor->disable(); + if (mon->enabled) { + mon->enabled = 0; + if (mon->disable) + mon->disable(); /* * Wait for the execution of all events to finish. @@ -273,90 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } -static void rv_disable_single(struct rv_monitor_def *mdef) +static void rv_disable_single(struct rv_monitor *mon) { - __rv_disable_monitor(mdef, true); + __rv_disable_monitor(mon, true); } -static int rv_enable_single(struct rv_monitor_def *mdef) +static int rv_enable_single(struct rv_monitor *mon) { int retval; lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) + if (mon->enabled) return 0; - retval = mdef->monitor->enable(); + retval = mon->enable(); if (!retval) - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } -static void rv_disable_container(struct rv_monitor_def *mdef) +static void rv_disable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int enabled = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; enabled += __rv_disable_monitor(p, false); } if (enabled) tracepoint_synchronize_unregister(); - mdef->monitor->enabled = 0; + mon->enabled = 0; } -static int rv_enable_container(struct rv_monitor_def *mdef) +static int rv_enable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int retval = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (retval || p->parent != mdef->monitor) + if (retval || p->parent != mon) break; retval = rv_enable_single(p); } if (retval) - rv_disable_container(mdef); + rv_disable_container(mon); else - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } /** * rv_disable_monitor - disable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success. */ -int rv_disable_monitor(struct rv_monitor_def *mdef) +int rv_disable_monitor(struct rv_monitor *mon) { - if (rv_is_container_monitor(mdef)) - rv_disable_container(mdef); + if (rv_is_container_monitor(mon)) + rv_disable_container(mon); else - rv_disable_single(mdef); + rv_disable_single(mon); return 0; } /** * rv_enable_monitor - enable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success, error otherwise. 
*/ -int rv_enable_monitor(struct rv_monitor_def *mdef) +int rv_enable_monitor(struct rv_monitor *mon) { int retval; - if (rv_is_container_monitor(mdef)) - retval = rv_enable_container(mdef); + if (rv_is_container_monitor(mon)) + retval = rv_enable_container(mon); else - retval = rv_enable_single(mdef); + retval = rv_enable_single(mon); return retval; } @@ -367,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; int retval; bool val; @@ -378,9 +378,9 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u mutex_lock(&rv_interface_lock); if (val) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); + retval = rv_disable_monitor(mon); mutex_unlock(&rv_interface_lock); @@ -399,12 +399,12 @@ static const struct file_operations interface_enable_fops = { static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; char buff[256]; memset(buff, 0, sizeof(buff)); - snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + snprintf(buff, sizeof(buff), "%s\n", mon->description); return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); } @@ -419,37 +419,37 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) +static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent) { struct dentry *root = parent ? 
parent->root_d : get_monitors_root(); - const char *name = mdef->monitor->name; + const char *name = mon->name; struct dentry *tmp; int retval; - mdef->root_d = rv_create_dir(name, root); - if (!mdef->root_d) + mon->root_d = rv_create_dir(name, root); + if (!mon->root_d) return -ENOMEM; - tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops); if (!tmp) { retval = -ENOMEM; goto out_remove_root; } - tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops); if (!tmp) { retval = -ENOMEM; goto out_remove_root; } - retval = reactor_populate_monitor(mdef); + retval = reactor_populate_monitor(mon); if (retval) goto out_remove_root; return 0; out_remove_root: - rv_remove(mdef->root_d); + rv_remove(mon->root_d); return retval; } @@ -458,13 +458,12 @@ out_remove_root: */ static int monitors_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mon_def = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); - if (mon_def->parent) - seq_printf(m, "%s:%s\n", mon_def->parent->name, - mon_def->monitor->name); + if (mon->parent) + seq_printf(m, "%s:%s\n", mon->parent->name, mon->name); else - seq_printf(m, "%s\n", mon_def->monitor->name); + seq_printf(m, "%s\n", mon->name); return 0; } @@ -496,13 +495,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor_def *m_def = p; + struct rv_monitor *mon = p; (*pos)++; - list_for_each_entry_continue(m_def, &rv_monitors_list, list) { - if (m_def->monitor->enabled) - return m_def; + list_for_each_entry_continue(mon, &rv_monitors_list, list) { + if (mon->enabled) + return mon; } return NULL; @@ -510,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) { - struct rv_monitor_def *m_def; + struct rv_monitor *mon; loff_t l; mutex_lock(&rv_interface_lock); @@ -518,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) if (list_empty(&rv_monitors_list)) return NULL; - m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + mon = list_entry(&rv_monitors_list, struct rv_monitor, list); for (l = 0; l <= *pos; ) { - m_def = enabled_monitors_next(m, m_def, &l); - if (!m_def) + mon = enabled_monitors_next(m, mon, &l); + if (!mon) break; } - return m_def; + return mon; } /* @@ -566,13 +565,13 @@ static const struct file_operations available_monitors_ops = { */ static void disable_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int enabled = 0; mutex_lock(&rv_interface_lock); - list_for_each_entry(mdef, &rv_monitors_list, list) - enabled += __rv_disable_monitor(mdef, false); + list_for_each_entry(mon, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mon, false); if (enabled) { /* @@ -598,7 +597,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user size_t count, loff_t *ppos) { char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int retval = -EINVAL; bool enable = true; char *ptr, *tmp; @@ -633,17 +632,17 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user if (tmp) ptr = tmp+1; - 
list_for_each_entry(mdef, &rv_monitors_list, list) { - if (strcmp(ptr, mdef->monitor->name) != 0) + list_for_each_entry(mon, &rv_monitors_list, list) { + if (strcmp(ptr, mon->name) != 0) continue; /* * Monitor found! */ if (enable) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); + retval = rv_disable_monitor(mon); if (!retval) retval = count; @@ -702,11 +701,11 @@ static void turn_monitoring_off(void) static void reset_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled && mdef->monitor->reset) - mdef->monitor->reset(); + list_for_each_entry(mon, &rv_monitors_list, list) { + if (mon->enabled && mon->reset) + mon->reset(); } } @@ -768,10 +767,9 @@ static const struct file_operations monitoring_on_fops = { .read = monitoring_on_read_data, }; -static void destroy_monitor_dir(struct rv_monitor_def *mdef) +static void destroy_monitor_dir(struct rv_monitor *mon) { - reactor_cleanup_monitor(mdef); - rv_remove(mdef->root_d); + rv_remove(mon->root_d); } /** @@ -783,7 +781,7 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) */ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r, *p = NULL; + struct rv_monitor *r; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { @@ -795,49 +793,31 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) mutex_lock(&rv_interface_lock); list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(monitor->name, r->monitor->name) == 0) { + if (strcmp(monitor->name, r->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); retval = -EEXIST; goto out_unlock; } } - if (parent) { - list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(parent->name, r->monitor->name) == 0) { - p = r; - break; - } - } - } - - if (p && rv_is_nested_monitor(p)) { + if (parent && rv_is_nested_monitor(parent)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); retval = -EINVAL; goto out_unlock; } - r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); - if (!r) { - retval = -ENOMEM; - goto out_unlock; - } - - r->monitor = monitor; - r->parent = parent; + monitor->parent = parent; - retval = create_monitor_dir(r, p); - if (retval) { - kfree(r); - goto out_unlock; - } + retval = create_monitor_dir(monitor, parent); + if (retval) + return retval; /* keep children close to the parent for easier visualisation */ - if (p) - list_add(&r->list, &p->list); + if (parent) + list_add(&monitor->list, &parent->list); else - list_add_tail(&r->list, &rv_monitors_list); + list_add_tail(&monitor->list, &rv_monitors_list); out_unlock: mutex_unlock(&rv_interface_lock); @@ -852,17 +832,11 @@ out_unlock: */ int rv_unregister_monitor(struct rv_monitor *monitor) { - struct rv_monitor_def *ptr, *next; - mutex_lock(&rv_interface_lock); - list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { - if (strcmp(monitor->name, ptr->monitor->name) == 0) { - rv_disable_monitor(ptr); - list_del(&ptr->list); - destroy_monitor_dir(ptr); - } - } + rv_disable_monitor(monitor); + list_del(&monitor->list); + destroy_monitor_dir(monitor); mutex_unlock(&rv_interface_lock); return 0; diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 98fca0a1adbc..1485a70c1bf4 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -23,48 +23,21 @@ struct rv_interface { 
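/*
 * This hunk removes struct rv_monitor_def and struct rv_reactor_def: their
 * bookkeeping now lives in struct rv_monitor and struct rv_reactor themselves,
 * declared in include/linux/rv.h (outside this diff). A rough reconstruction
 * of struct rv_monitor as implied by its uses in this patch; the header
 * remains authoritative:
 */
struct rv_monitor {
	const char		*name;
	const char		*description;
	bool			enabled;
	int			(*enable)(void);
	void			(*disable)(void);
	void			(*reset)(void);
	__printf(1, 2) void	(*react)(const char *msg, ...);
	struct list_head	list;		/* linkage in rv_monitors_list */
	struct rv_monitor	*parent;	/* set by rv_register_monitor() */
	struct dentry		*root_d;	/* per-monitor tracefs directory */
	struct rv_reactor	*reactor;	/* currently selected reactor */
};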
extern struct mutex rv_interface_lock; extern struct list_head rv_monitors_list; -#ifdef CONFIG_RV_REACTORS -struct rv_reactor_def { - struct list_head list; - struct rv_reactor *reactor; - /* protected by the monitor interface lock */ - int counter; -}; -#endif - -struct rv_monitor_def { - struct list_head list; - struct rv_monitor *monitor; - struct rv_monitor *parent; - struct dentry *root_d; -#ifdef CONFIG_RV_REACTORS - struct rv_reactor_def *rdef; - bool reacting; -#endif - bool task_monitor; -}; - struct dentry *get_monitors_root(void); -int rv_disable_monitor(struct rv_monitor_def *mdef); -int rv_enable_monitor(struct rv_monitor_def *mdef); -bool rv_is_container_monitor(struct rv_monitor_def *mdef); -bool rv_is_nested_monitor(struct rv_monitor_def *mdef); +int rv_disable_monitor(struct rv_monitor *mon); +int rv_enable_monitor(struct rv_monitor *mon); +bool rv_is_container_monitor(struct rv_monitor *mon); +bool rv_is_nested_monitor(struct rv_monitor *mon); #ifdef CONFIG_RV_REACTORS -int reactor_populate_monitor(struct rv_monitor_def *mdef); -void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int reactor_populate_monitor(struct rv_monitor *mon); int init_rv_reactors(struct dentry *root_dir); #else -static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +static inline int reactor_populate_monitor(struct rv_monitor *mon) { return 0; } -static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - return; -} - static inline int init_rv_reactors(struct dentry *root_dir) { return 0; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 9501ca886d83..d32859fec238 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -70,12 +70,12 @@ */ static LIST_HEAD(rv_reactors_list); -static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +static struct rv_reactor *get_reactor_rdef_by_name(char *name) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(name, r->reactor->name) == 0) + if (strcmp(name, r->name) == 0) return r; } return NULL; @@ -86,9 +86,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) */ static int reactors_show(struct seq_file *m, void *p) { - struct rv_reactor_def *rea_def = p; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - seq_printf(m, "%s\n", rea_def->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -138,13 +138,13 @@ static const struct file_operations available_reactors_ops = { */ static int monitor_reactor_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mdef = m->private; - struct rv_reactor_def *rdef = p; + struct rv_monitor *mon = m->private; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - if (mdef->rdef == rdef) - seq_printf(m, "[%s]\n", rdef->reactor->name); + if (mon->reactor == reactor) + seq_printf(m, "[%s]\n", reactor->name); else - seq_printf(m, "%s\n", rdef->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -158,43 +158,37 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, - struct rv_reactor_def *rdef, - bool reacting, bool nested) +static void monitor_swap_reactors_single(struct rv_monitor *mon, + struct rv_reactor *reactor, + bool nested) { bool monitor_enabled; /* nothing to do */ - if (mdef->rdef == rdef) + if (mon->reactor == reactor) return; - 
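/*
 * The swap below first disables an enabled monitor so that no handler runs
 * while the reactor pointer is being replaced, then re-enables it; for nested
 * children the re-enable is skipped and the final non-nested call brings the
 * whole container back up in one go.
 */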
monitor_enabled = mdef->monitor->enabled; + monitor_enabled = mon->enabled; if (monitor_enabled) - rv_disable_monitor(mdef); + rv_disable_monitor(mon); - /* swap reactor's usage */ - mdef->rdef->counter--; - rdef->counter++; - - mdef->rdef = rdef; - mdef->reacting = reacting; - mdef->monitor->react = rdef->reactor->react; + mon->reactor = reactor; + mon->react = reactor->react; /* enable only once if iterating through a container */ if (monitor_enabled && !nested) - rv_enable_monitor(mdef); + rv_enable_monitor(mon); } -static void monitor_swap_reactors(struct rv_monitor_def *mdef, - struct rv_reactor_def *rdef, bool reacting) +static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; - if (rv_is_container_monitor(mdef)) + if (rv_is_container_monitor(mon)) list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; - monitor_swap_reactors_single(p, rdef, reacting, true); + monitor_swap_reactors_single(p, reactor, true); } /* * This call enables and disables the monitor if they were active. @@ -202,7 +196,7 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, * All nested monitors are enabled also if they were off, we may refine * this logic in the future. */ - monitor_swap_reactors_single(mdef, rdef, reacting, false); + monitor_swap_reactors_single(mon, reactor, false); } static ssize_t @@ -210,11 +204,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; - struct rv_reactor_def *rdef; + struct rv_monitor *mon; + struct rv_reactor *reactor; struct seq_file *seq_f; int retval = -EINVAL; - bool enable; char *ptr; int len; @@ -237,22 +230,17 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, * See monitor_reactors_open() */ seq_f = file->private_data; - mdef = seq_f->private; + mon = seq_f->private; mutex_lock(&rv_interface_lock); retval = -EINVAL; - list_for_each_entry(rdef, &rv_reactors_list, list) { - if (strcmp(ptr, rdef->reactor->name) != 0) + list_for_each_entry(reactor, &rv_reactors_list, list) { + if (strcmp(ptr, reactor->name) != 0) continue; - if (rdef == get_reactor_rdef_by_name("nop")) - enable = false; - else - enable = true; - - monitor_swap_reactors(mdef, rdef, enable); + monitor_swap_reactors(mon, reactor); retval = count; break; @@ -268,7 +256,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, */ static int monitor_reactors_open(struct inode *inode, struct file *file) { - struct rv_monitor_def *mdef = inode->i_private; + struct rv_monitor *mon = inode->i_private; struct seq_file *seq_f; int ret; @@ -284,7 +272,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file) /* * Copy the create file "private" data to the seq_file private data. 
*/ - seq_f->private = mdef; + seq_f->private = mon; return 0; }; @@ -299,23 +287,16 @@ static const struct file_operations monitor_reactors_ops = { static int __rv_register_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(reactor->name, r->reactor->name) == 0) { + if (strcmp(reactor->name, r->name) == 0) { pr_info("Reactor %s is already registered\n", reactor->name); return -EINVAL; } } - r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->reactor = reactor; - r->counter = 0; - - list_add_tail(&r->list, &rv_reactors_list); + list_add_tail(&reactor->list, &rv_reactors_list); return 0; } @@ -350,30 +331,10 @@ int rv_register_reactor(struct rv_reactor *reactor) */ int rv_unregister_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *ptr, *next; - int ret = 0; - mutex_lock(&rv_interface_lock); - - list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { - if (strcmp(reactor->name, ptr->reactor->name) == 0) { - - if (!ptr->counter) { - list_del(&ptr->list); - } else { - printk(KERN_WARNING - "rv: the rv_reactor %s is in use by %d monitor(s)\n", - ptr->reactor->name, ptr->counter); - printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", - ptr->reactor->name); - ret = -EBUSY; - break; - } - } - } - + list_del(&reactor->list); mutex_unlock(&rv_interface_lock); - return ret; + return 0; } /* @@ -454,43 +415,30 @@ static const struct file_operations reacting_on_fops = { /** * reactor_populate_monitor - creates per monitor reactors file - * @mdef: monitor's definition. + * @mon: The monitor. * * Returns 0 if successful, error otherwise. */ -int reactor_populate_monitor(struct rv_monitor_def *mdef) +int reactor_populate_monitor(struct rv_monitor *mon) { struct dentry *tmp; - tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops); if (!tmp) return -ENOMEM; /* * Configure as the rv_nop reactor. */ - mdef->rdef = get_reactor_rdef_by_name("nop"); - mdef->rdef->counter++; - mdef->reacting = false; + mon->reactor = get_reactor_rdef_by_name("nop"); return 0; } -/** - * reactor_cleanup_monitor - cleanup a monitor reference - * @mdef: monitor's definition. - */ -void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - lockdep_assert_held(&rv_interface_lock); - mdef->rdef->counter--; - WARN_ON_ONCE(mdef->rdef->counter < 0); -} - /* * Nop reactor register */ -static void rv_nop_reaction(char *msg) +__printf(1, 2) static void rv_nop_reaction(const char *msg, ...) 
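/*
 * Reactors now take a printf-style format rather than a fixed string: the
 * panic and printk reactors above forward their va_list to vpanic() and
 * vprintk_deferred(), and the nop reaction here simply ignores its arguments.
 */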
{ } diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 422b75f58891..4a6faddac614 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -16,24 +16,24 @@ DECLARE_EVENT_CLASS(event_da_monitor, TP_ARGS(state, event, next_state, final_state), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->final_state = final_state; ), - TP_printk("%s x %s -> %s %s", - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state ? "(final)" : "") + TP_printk("%s x %s -> %s%s", + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? " (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor, @@ -43,26 +43,26 @@ DECLARE_EVENT_CLASS(error_da_monitor, TP_ARGS(state, event), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); + __assign_str(state); + __assign_str(event); ), TP_printk("event %s not expected in the state %s", - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include <monitors/wip/wip_trace.h> -#include <monitors/tss/tss_trace.h> #include <monitors/sco/sco_trace.h> #include <monitors/scpd/scpd_trace.h> #include <monitors/snep/snep_trace.h> -#include <monitors/sncid/sncid_trace.h> +#include <monitors/sts/sts_trace.h> +#include <monitors/opid/opid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -75,27 +75,27 @@ DECLARE_EVENT_CLASS(event_da_monitor_id, TP_ARGS(id, state, event, next_state, final_state), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->id = id; - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->id = id; + __entry->final_state = final_state; ), - TP_printk("%d: %s x %s -> %s %s", + TP_printk("%d: %s x %s -> %s%s", __entry->id, - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state ? "(final)" : "") + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? 
" (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor_id, @@ -105,32 +105,108 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, TP_ARGS(id, state, event), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __field( int, id ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - __entry->id = id; + __assign_str(state); + __assign_str(event); + __entry->id = id; ), TP_printk("%d: event %s not expected in the state %s", __entry->id, - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include <monitors/wwnr/wwnr_trace.h> #include <monitors/snroc/snroc_trace.h> +#include <monitors/nrp/nrp_trace.h> +#include <monitors/sssw/sssw_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ +#ifdef CONFIG_LTL_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_ltl_monitor_id, + + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + + TP_ARGS(task, states, atoms, next), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + __string(states, states) + __string(atoms, atoms) + __string(next, next) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + __assign_str(states); + __assign_str(atoms); + __assign_str(next); + ), + + TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid, + __get_str(states), __get_str(atoms), __get_str(next)) +); + +DECLARE_EVENT_CLASS(error_ltl_monitor_id, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + ), + + TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid) +); +#include <monitors/pagefault/pagefault_trace.h> +#include <monitors/sleep/sleep_trace.h> +// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here +#endif /* CONFIG_LTL_MON_EVENTS_ID */ + +#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS +/* Tracepoint useful for monitors development, currenly only used in DA */ +TRACE_EVENT(rv_retries_error, + + TP_PROTO(char *name, char *event), + + TP_ARGS(name, event), + + TP_STRUCT__entry( + __string( name, name ) + __string( event, event ) + ), + + TP_fast_assign( + __assign_str(name); + __assign_str(event); + ), + + TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS) + " retries reached for event %s, resetting monitor %s", + __get_str(event), __get_str(name)) +); +#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */ #endif /* _TRACE_RV_H */ -/* This part ust be outside protection */ +/* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE rv_trace #include <trace/define_trace.h> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 95ae7c4e5835..7996f26c3f46 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4735,21 +4735,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 916555f0de81..a1d402124836 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -9,14 +9,15 @@ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> * */ +#include <linux/cleanup.h> +#include <linux/ftrace.h> #include <linux/module.h> #include <linux/mutex.h> -#include <linux/ftrace.h> #include "trace_dynevent.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -343,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec) val = *(unsigned int *)addr; break; default: - if (field->is_signed) - val = *(long *)addr; - else - val = *(unsigned long *)addr; + if (field->size == sizeof(long)) { + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + /* This is an array, point to the addr itself */ + val = (unsigned long)addr; break; } return val; @@ -797,18 +803,20 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - struct traceprobe_parse_context ctx = { - .event = ep->event, - .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->event = ep->event; + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); - traceprobe_finish_parse(&ctx); return ret; } @@ -869,10 +877,10 @@ static int __trace_eprobe_create(int argc, const char *argv[]) const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; struct trace_event_call *event_call; + char *buf1 __free(kfree) = NULL; + char *buf2 __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; struct trace_eprobe *ep = NULL; - char buf1[MAX_EVENT_NAME_LEN]; - char buf2[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0, filter_idx = 0; int i, filter_cnt; @@ -883,6 +891,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { + gbuf = 
kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + goto mem_error; event++; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); @@ -892,6 +903,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; + + buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf2) + goto mem_error; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); @@ -899,7 +915,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (!event) { - strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); + buf1 = kstrdup(sys_event, GFP_KERNEL); + if (!buf1) + goto mem_error; event = buf1; } @@ -972,6 +990,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_clear(); return ret; +mem_error: + ret = -ENOMEM; + goto error; parse_error: ret = -EINVAL; error: diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3885aadc434d..e4581e10782b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2900,6 +2900,10 @@ static __init int ftrace_test_event_filter(void) if (i == DATA_CNT) printk(KERN_CONT "OK\n"); + /* Need to call ftrace_test_filter to prevent a warning */ + if (!trace_ftrace_test_filter_enabled()) + trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8); + return 0; } diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b40fa59159ac..b36ade43d4b3 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -4,15 +4,18 @@ * Copyright (C) 2022 Google LLC. */ #define pr_fmt(fmt) "trace_fprobe: " fmt -#include <asm/ptrace.h> #include <linux/fprobe.h> +#include <linux/list.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/rculist.h> #include <linux/security.h> #include <linux/tracepoint.h> #include <linux/uaccess.h> +#include <asm/ptrace.h> + #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_kernel.h" @@ -21,7 +24,6 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 -#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -38,15 +40,156 @@ static struct dyn_event_operations trace_fprobe_ops = { .match = trace_fprobe_match, }; +/* List of tracepoint_user */ +static LIST_HEAD(tracepoint_user_list); +static DEFINE_MUTEX(tracepoint_user_mutex); + +/* While living tracepoint_user, @tpoint can be NULL and @refcount != 0. */ +struct tracepoint_user { + struct list_head list; + const char *name; + struct tracepoint *tpoint; + unsigned int refcount; +}; + +/* NOTE: you must lock tracepoint_user_mutex. 
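 * This lock also serializes tracepoint_user_find_get() and
 * tracepoint_user_put() against the module notifier below, which re-registers
 * or clears tuser->tpoint as modules come and go.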
*/ +#define for_each_tracepoint_user(tuser) \ + list_for_each_entry(tuser, &tracepoint_user_list, list) + +static int tracepoint_user_register(struct tracepoint_user *tuser) +{ + struct tracepoint *tpoint = tuser->tpoint; + + if (!tpoint) + return 0; + + return tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); +} + +static void tracepoint_user_unregister(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return; + + WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL)); + tuser->tpoint = NULL; +} + +static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return 0UL; + + return (unsigned long)tuser->tpoint->probestub; +} + +static void __tracepoint_user_free(struct tracepoint_user *tuser) +{ + if (!tuser) + return; + kfree(tuser->name); + kfree(tuser); +} + +DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T)) + +static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint) +{ + struct tracepoint_user *tuser __free(tuser_free) = NULL; + int ret; + + tuser = kzalloc(sizeof(*tuser), GFP_KERNEL); + if (!tuser) + return NULL; + tuser->name = kstrdup(name, GFP_KERNEL); + if (!tuser->name) + return NULL; + + if (tpoint) { + ret = tracepoint_user_register(tuser); + if (ret) + return ERR_PTR(ret); + } + + tuser->tpoint = tpoint; + tuser->refcount = 1; + INIT_LIST_HEAD(&tuser->list); + list_add(&tuser->list, &tracepoint_user_list); + + return_ptr(tuser); +} + +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod); + +/* + * Get tracepoint_user if exist, or allocate new one and register it. + * If tracepoint is on a module, get its refcounter too. + * This returns errno or NULL (not loaded yet) or tracepoint_user. + */ +static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod) +{ + struct module *mod __free(module_put) = NULL; + struct tracepoint_user *tuser; + struct tracepoint *tpoint; + + if (!name || !pmod) + return ERR_PTR(-EINVAL); + + /* Get and lock the module which has tracepoint. */ + tpoint = find_tracepoint(name, &mod); + + guard(mutex)(&tracepoint_user_mutex); + /* Search existing tracepoint_user */ + for_each_tracepoint_user(tuser) { + if (!strcmp(tuser->name, name)) { + tuser->refcount++; + *pmod = no_free_ptr(mod); + return tuser; + } + } + + /* The corresponding tracepoint_user is not found. */ + tuser = __tracepoint_user_init(name, tpoint); + if (!IS_ERR_OR_NULL(tuser)) + *pmod = no_free_ptr(mod); + + return tuser; +} + +static void tracepoint_user_put(struct tracepoint_user *tuser) +{ + scoped_guard(mutex, &tracepoint_user_mutex) { + if (--tuser->refcount > 0) + return; + + list_del(&tuser->list); + tracepoint_user_unregister(tuser); + } + + __tracepoint_user_free(tuser); +} + +DEFINE_FREE(tuser_put, struct tracepoint_user *, + if (!IS_ERR_OR_NULL(_T)) + tracepoint_user_put(_T)) + /* * Fprobe event core functions */ + +/* + * @tprobe is true for tracepoint probe. + * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint is not + * loaded with a module. If @tuser != NULL, this trace_fprobe is enabled. 
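 * The surrounding helpers lean on <linux/cleanup.h>: locals annotated with
 * __free(tuser_free) or __free(tuser_put) release the tracepoint_user
 * automatically when they go out of scope, while no_free_ptr() and
 * return_ptr() hand ownership out on the success paths so that cleanup is
 * skipped.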
+ */ struct trace_fprobe { struct dyn_event devent; struct fprobe fp; const char *symbol; - struct tracepoint *tpoint; - struct module *mod; + bool tprobe; + struct tracepoint_user *tuser; struct trace_probe tp; }; @@ -76,7 +219,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf) static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) { - return tf->tpoint != NULL; + return tf->tprobe; } static const char *trace_fprobe_symbol(struct trace_fprobe *tf) @@ -411,6 +554,8 @@ static void free_trace_fprobe(struct trace_fprobe *tf) { if (tf) { trace_probe_cleanup(&tf->tp); + if (tf->tuser) + tracepoint_user_put(tf->tuser); kfree(tf->symbol); kfree(tf); } @@ -425,9 +570,8 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, - struct tracepoint *tpoint, - struct module *mod, - int nargs, bool is_return) + int nargs, bool is_return, + bool is_tracepoint) { struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; @@ -445,8 +589,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, else tf->fp.entry_handler = fentry_dispatcher; - tf->tpoint = tpoint; - tf->mod = mod; + tf->tprobe = is_tracepoint; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) @@ -469,98 +612,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event, return NULL; } -static inline int __enable_trace_fprobe(struct trace_fprobe *tf) -{ - if (trace_fprobe_is_registered(tf)) - enable_fprobe(&tf->fp); - - return 0; -} - -static void __disable_trace_fprobe(struct trace_probe *tp) -{ - struct trace_fprobe *tf; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - if (!trace_fprobe_is_registered(tf)) - continue; - disable_fprobe(&tf->fp); - } -} - -/* - * Enable trace_probe - * if the file is NULL, enable "perf" handler, or enable "trace" handler. - */ -static int enable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - struct trace_fprobe *tf; - bool enabled; - int ret = 0; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - enabled = trace_probe_is_enabled(tp); - - /* This also changes "enabled" state */ - if (file) { - ret = trace_probe_add_file(tp, file); - if (ret) - return ret; - } else - trace_probe_set_flag(tp, TP_FLAG_PROFILE); - - if (!enabled) { - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - /* TODO: check the fprobe is gone */ - __enable_trace_fprobe(tf); - } - } - - return 0; -} - -/* - * Disable trace_probe - * if the file is NULL, disable "perf" handler, or disable "trace" handler. - */ -static int disable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - - if (file) { - if (!trace_probe_get_file_link(tp, file)) - return -ENOENT; - if (!trace_probe_has_single_file(tp)) - goto out; - trace_probe_clear_flag(tp, TP_FLAG_TRACE); - } else - trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - - if (!trace_probe_is_enabled(tp)) - __disable_trace_fprobe(tp); - - out: - if (file) - /* - * Synchronization is done in below function. For perf event, - * file == NULL and perf_trace_event_unreg() calls - * tracepoint_synchronize_unregister() to ensure synchronize - * event. We don't need to care about it. 
- */ - trace_probe_remove_file(tp, file); - - return 0; -} - /* Event entry printers */ static enum print_line_t print_fentry_event(struct trace_iterator *iter, int flags, @@ -712,20 +763,52 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; + struct tracepoint_user *tuser __free(tuser_put) = NULL; + struct module *mod __free(module_put) = NULL; + unsigned long ip; int ret; + if (WARN_ON_ONCE(tf->tuser)) + return -EINVAL; + + /* If the tracepoint is in a module, it must be locked in this function. */ + tuser = tracepoint_user_find_get(tf->symbol, &mod); + /* This tracepoint is not loaded yet */ + if (IS_ERR(tuser)) + return PTR_ERR(tuser); + if (!tuser) + return -ENOMEM; + + /* Register fprobe only if the tracepoint is loaded. */ + if (tuser->tpoint) { + ip = tracepoint_user_ip(tuser); + if (WARN_ON_ONCE(!ip)) + return -ENOENT; + + ret = register_fprobe_ips(&tf->fp, &ip, 1); + if (ret < 0) + return ret; + } + + tf->tuser = no_free_ptr(tuser); + return 0; +} + +/* Returns an error if the target function is not available, or 0 */ +static int trace_fprobe_verify_target(struct trace_fprobe *tf) +{ + int ret; + + /* Tracepoint should have a stub function. */ + if (trace_fprobe_is_tracepoint(tf)) + return 0; + /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. + * Note: since we don't lock the module, even if this succeeded, + * register_fprobe() later can fail. */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; - return register_fprobe_ips(&tf->fp, &ip, 1); + ret = fprobe_count_ips_from_filter(tf->symbol, NULL); + return (ret < 0) ? 
ret : 0; } /* Internal register function - just handle fprobe and flags */ @@ -747,20 +830,10 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(&tf->tp)) - tf->fp.flags &= ~FPROBE_FL_DISABLED; - else - tf->fp.flags |= FPROBE_FL_DISABLED; - - if (trace_fprobe_is_tracepoint(tf)) { - - /* This tracepoint is not loaded yet */ - if (tf->tpoint == TRACEPOINT_STUB) - return 0; + tf->fp.flags &= ~FPROBE_FL_DISABLED; + if (trace_fprobe_is_tracepoint(tf)) return __regsiter_tracepoint_fprobe(tf); - } /* TODO: handle filter, nofilter or symbol list */ return register_fprobe(&tf->fp, tf->symbol, NULL); @@ -769,15 +842,11 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) /* Internal unregister function - just handle fprobe and flags */ static void __unregister_trace_fprobe(struct trace_fprobe *tf) { - if (trace_fprobe_is_registered(tf)) { + if (trace_fprobe_is_registered(tf)) unregister_fprobe(&tf->fp); - memset(&tf->fp, 0, sizeof(tf->fp)); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; - } + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; } } @@ -837,7 +906,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, return false; } -static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to) { int ret; @@ -865,7 +934,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) if (ret) return ret; - ret = __register_trace_fprobe(tf); + ret = trace_fprobe_verify_target(tf); if (ret) trace_probe_unlink(&tf->tp); else @@ -874,8 +943,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) return ret; } -/* Register a trace_probe and probe_event */ -static int register_trace_fprobe(struct trace_fprobe *tf) +/* Register a trace_probe and probe_event, and check the fprobe is available. */ +static int register_trace_fprobe_event(struct trace_fprobe *tf) { struct trace_fprobe *old_tf; int ret; @@ -885,7 +954,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf) old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); if (old_tf) - return append_trace_fprobe(tf, old_tf); + return append_trace_fprobe_event(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -898,8 +967,8 @@ static int register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Register fprobe */ - ret = __register_trace_fprobe(tf); + /* Verify fprobe is sane. */ + ret = trace_fprobe_verify_target(tf); if (ret < 0) unregister_fprobe_event(tf); else @@ -963,15 +1032,6 @@ static struct tracepoint *find_tracepoint(const char *tp_name, } #ifdef CONFIG_MODULES -static void reenable_trace_fprobe(struct trace_fprobe *tf) -{ - struct trace_probe *tp = &tf->tp; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - __enable_trace_fprobe(tf); - } -} - /* * Find a tracepoint from specified module. In this case, this does not get the * module's refcount. The caller must ensure the module is not freed. @@ -988,36 +1048,94 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod, return data.tpoint; } +/* These are CONFIG_MODULES=y specific functions. 
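 * They keep tuser->tpoint and the corresponding fprobe attachment in sync as
 * modules come and go; __tprobe_event_module_cb() is registered at priority
 * -10, i.e. after the tracepoint_user notifier above, so the probestub has
 * already been re-registered by the time the fprobe is re-attached on
 * MODULE_STATE_COMING.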
*/ +static bool tracepoint_user_within_module(struct tracepoint_user *tuser, + struct module *mod) +{ + return within_module(tracepoint_user_ip(tuser), mod); +} + +static int tracepoint_user_register_again(struct tracepoint_user *tuser, + struct tracepoint *tpoint) +{ + tuser->tpoint = tpoint; + return tracepoint_user_register(tuser); +} + +static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser) +{ + tracepoint_user_unregister(tuser); + tuser->tpoint = NULL; +} + +/* module callback for tracepoint_user */ static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; + struct tracepoint_user *tuser; struct tracepoint *tpoint; + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + mutex_lock(&tracepoint_user_mutex); + for_each_tracepoint_user(tuser) { + if (val == MODULE_STATE_COMING) { + /* This is not a tracepoint in this module. Skip it. */ + tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name); + if (!tpoint) + continue; + WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint)); + } else if (val == MODULE_STATE_GOING && + tracepoint_user_within_module(tuser, tp_mod->mod)) { + /* Unregister all tracepoint_user in this module. */ + tracepoint_user_unregister_clear(tuser); + } + } + mutex_unlock(&tracepoint_user_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; + +/* module callback for tprobe events */ +static int __tprobe_event_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ struct trace_fprobe *tf; struct dyn_event *pos; + struct module *mod = data; if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { - if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { - tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); - if (tpoint) { - tf->tpoint = tpoint; - tf->mod = tp_mod->mod; - if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && - trace_probe_is_enabled(&tf->tp)) - reenable_trace_fprobe(tf); - } - } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { + /* Skip fprobe and disabled tprobe events. */ + if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser) + continue; + + /* Before this notification, tracepoint notifier has already done. */ + if (val == MODULE_STATE_COMING && + tracepoint_user_within_module(tf->tuser, mod)) { + unsigned long ip = tracepoint_user_ip(tf->tuser); + + WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1)); + } else if (val == MODULE_STATE_GOING && + /* + * tracepoint_user_within_module() does not work here because + * tracepoint_user is already unregistered and cleared tpoint. + * Instead, checking whether the fprobe is registered but + * tpoint is cleared(unregistered). Such unbalance probes + * must be adjusted anyway. 
+ */ + trace_fprobe_is_registered(tf) && + !tf->tuser->tpoint) { unregister_fprobe(&tf->fp); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = TRACEPOINT_STUB; - tf->mod = NULL; - } } } mutex_unlock(&event_mutex); @@ -1025,8 +1143,11 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, return NOTIFY_DONE; } -static struct notifier_block tracepoint_module_nb = { - .notifier_call = __tracepoint_probe_module_cb, +/* NOTE: this must be called after tracepoint callback */ +static struct notifier_block tprobe_event_module_nb = { + .notifier_call = __tprobe_event_module_cb, + /* Make sure this is later than tracepoint module notifier. */ + .priority = -10, }; #endif /* CONFIG_MODULES */ @@ -1086,8 +1207,6 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) - static int trace_fprobe_create_internal(int argc, const char *argv[], struct traceprobe_parse_context *ctx) { @@ -1116,19 +1235,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; - int i, new_argc = 0, ret = 0; - bool is_return = false; - char *symbol __free(kfree) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + struct module *mod __free(module_put) = NULL; const char **new_argv __free(kfree) = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char sbuf[KSYM_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; + char *symbol __free(kfree) = NULL; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *sbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; char *dbuf __free(kfree) = NULL; + int i, new_argc = 0, ret = 0; bool is_tracepoint = false; - struct module *tp_mod __free(module_put) = NULL; - struct tracepoint *tpoint = NULL; + bool is_return = false; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1156,6 +1274,9 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -1163,15 +1284,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], } if (!event) { + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; /* Make a new event name */ if (is_tracepoint) - snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s", isdigit(*symbol) ? "_" : "", symbol); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, is_return ? "exit" : "entry"); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } if (is_return) @@ -1179,25 +1303,28 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], else ctx->flags |= TPARG_FL_FENTRY; + ctx->funcname = NULL; if (is_tracepoint) { + /* Get tracepoint and lock its module until the end of the registration. 
*/ + struct tracepoint *tpoint; + ctx->flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol, &tp_mod); + mod = NULL; + tpoint = find_tracepoint(symbol, &mod); if (tpoint) { - ctx->funcname = kallsyms_lookup( - (unsigned long)tpoint->probestub, - NULL, NULL, NULL, sbuf); - } else if (IS_ENABLED(CONFIG_MODULES)) { - /* This *may* be loaded afterwards */ - tpoint = TRACEPOINT_STUB; - ctx->funcname = symbol; - } else { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - return -EINVAL; + sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); } - } else + } + if (!ctx->funcname) ctx->funcname = symbol; + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, abuf, MAX_BTF_ARGS_LEN, ctx); @@ -1218,8 +1345,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], return ret; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, - argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ @@ -1251,7 +1377,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (ret < 0) return ret; - ret = register_trace_fprobe(tf); + ret = register_trace_fprobe_event(tf); if (ret) { trace_probe_log_set_index(1); if (ret == -EILSEQ) @@ -1271,14 +1397,17 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], static int trace_fprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + trace_probe_log_init("trace_fprobe", argc, argv); - ret = trace_fprobe_create_internal(argc, argv, &ctx); - traceprobe_finish_parse(&ctx); + ret = trace_fprobe_create_internal(argc, argv, ctx); trace_probe_log_clear(); return ret; } @@ -1321,6 +1450,84 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) } /* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + ret = __register_trace_fprobe(tf); + if (ret < 0) + return ret; + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. 
+ */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_fprobe *tf; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + unregister_fprobe(&tf->fp); + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. */ static int fprobe_register(struct trace_event_call *event, @@ -1365,6 +1572,9 @@ static __init int init_fprobe_trace_early(void) ret = register_tracepoint_module_notifier(&tracepoint_module_nb); if (ret) return ret; + ret = register_module_notifier(&tprobe_event_module_nb); + if (ret) + return ret; #endif return 0; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 14d74a7491b8..66e1a527cf1a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -513,7 +513,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ - char pid_str[11]; + char pid_str[12]; int spaces = 0; int len; int i; diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d7b135de958a..896ff78b8349 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.array_buffer->buffer, - cpu, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu]); + ring_buffer_read_start(iter.array_buffer->buffer, + cpu, GFP_ATOMIC); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.array_buffer->buffer, + ring_buffer_read_start(iter.array_buffer->buffer, cpu_file, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3e5c47b6d7b2..ccae62d4fb91 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -9,19 +9,19 @@ #include <linux/bpf-cgroup.h> #include <linux/cleanup.h> -#include <linux/security.h> +#include <linux/error-injection.h> #include <linux/module.h> -#include <linux/uaccess.h> #include <linux/rculist.h> -#include <linux/error-injection.h> +#include <linux/security.h> +#include <linux/uaccess.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -861,20 +861,20 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned 
long. */ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; + const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + const char **new_argv __free(kfree) = NULL; int i, len, new_argc = 0, ret = 0; - bool is_return = false; char *symbol __free(kfree) = NULL; - char *tmp = NULL; - const char **new_argv __free(kfree) = NULL; - const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; + char *dbuf __free(kfree) = NULL; enum probe_print_type ptype; + bool is_return = false; int maxactive = 0; - long offset = 0; void *addr = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf __free(kfree) = NULL; + char *tmp = NULL; + long offset = 0; switch (argv[0][0]) { case 'r': @@ -893,6 +893,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], event++; if (isdigit(argv[0][1])) { + char *buf __free(kfree) = NULL; + if (!is_return) { trace_probe_log_err(1, BAD_MAXACT_TYPE); return -EINVAL; @@ -905,7 +907,7 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_err(1, BAD_MAXACT); return -EINVAL; } - memcpy(buf, &argv[0][1], len); + buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { @@ -973,6 +975,9 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -981,16 +986,22 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], if (!event) { /* Make a new event name */ + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", is_return ? 'r' : 'p', symbol, offset); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 
'r' : 'p', addr); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; ctx->funcname = symbol; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, @@ -1065,14 +1076,18 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], static int trace_kprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->flags = TPARG_FL_KERNEL; + trace_probe_log_init("trace_kprobe", argc, argv); - ret = trace_kprobe_create_internal(argc, argv, &ctx); + ret = trace_kprobe_create_internal(argc, argv, ctx); - traceprobe_finish_parse(&ctx); trace_probe_log_clear(); return ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 40830a3ecd96..5cbdc423afeb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,8 +13,8 @@ #include <linux/bpf.h> #include <linux/fs.h> -#include "trace_btf.h" +#include "trace_btf.h" #include "trace_probe.h" #undef C @@ -247,7 +247,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) return 0; } -/* @buf must has MAX_EVENT_NAME_LEN size */ +/** + * traceprobe_parse_event_name() - Parse a string into group and event names + * @pevent: A pointer to the string to be parsed. + * @pgroup: A pointer to the group name. + * @buf: A buffer to store the parsed group name. + * @offset: The offset of the string in the original user command, for logging. + * + * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]` + * (either GROUP or EVENT or both must be specified). + * Since the parsed group name is stored in @buf, the caller must ensure @buf + * is at least MAX_EVENT_NAME_LEN bytes. + * + * Return: 0 on success, or -EINVAL on failure. + * + * If success, *@pevent is updated to point to the event name part of the + * original string, or NULL if there is no event name. + * Also, *@pgroup is updated to point to the parsed group which is stored + * in @buf, or NULL if there is no group name. + */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset) { @@ -779,6 +797,36 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset) +{ + code[0].op = FETCH_OP_ARG; + code[0].param = argnum; + code[1].op = FETCH_OP_ST_EDATA; + code[1].offset = offset; +} + +static int get_entry_arg_max_offset(struct probe_entry_arg *earg) +{ + int i, max_offset = 0; + + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. + * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. 
+ */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) { + if (earg->code[i].op == FETCH_OP_ST_EDATA) + if (earg->code[i].offset > max_offset) + max_offset = earg->code[i].offset; + } + return max_offset; +} + /* * Add the entry code to store the 'argnum'th parameter and return the offset * in the entry data buffer where the data will be stored. @@ -786,8 +834,7 @@ static int check_prepare_btf_string_fetch(char *typename, static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; - bool match = false; - int i, offset; + int i, offset, last_offset = 0; if (!earg) { earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL); @@ -804,78 +851,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) for (i = 0; i < earg->size; i++) earg->code[i].op = FETCH_OP_END; tp->entry_arg = earg; + store_entry_arg_at(earg->code, argnum, 0); + return 0; } /* - * The entry code array is repeating the pair of - * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] - * and the rest of entries are filled with [FETCH_OP_END]. + * NOTE: if anyone change the following rule, please rewrite this. + * The entry code array is filled with the pair of + * + * [FETCH_OP_ARG(argnum)] + * [FETCH_OP_ST_EDATA(offset of entry data buffer)] * - * To reduce the redundant function parameter fetching, we scan the entry - * code array to find the FETCH_OP_ARG which already fetches the 'argnum' - * parameter. If it doesn't match, update 'offset' to find the last - * offset. - * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we - * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and - * return data offset so that caller can find the data offset in the entry - * data buffer. + * and the rest of entries are filled with [FETCH_OP_END]. + * The offset should be incremented, thus the last pair should + * have the largest offset. */ - offset = 0; - for (i = 0; i < earg->size - 1; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - earg->code[i].op = FETCH_OP_ARG; - earg->code[i].param = argnum; - earg->code[i + 1].op = FETCH_OP_ST_EDATA; - earg->code[i + 1].offset = offset; - return offset; - case FETCH_OP_ARG: - match = (earg->code[i].param == argnum); - break; - case FETCH_OP_ST_EDATA: - offset = earg->code[i].offset; - if (match) - return offset; - offset += sizeof(unsigned long); - break; - default: - break; - } + + /* Search the offset for the sprcified argnum. */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) { + if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG)) + return -EINVAL; + + if (earg->code[i].param != argnum) + continue; + + if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + + return earg->code[i + 1].offset; + } + /* Not found, append new entry if possible. */ + if (i >= earg->size - 1) + return -ENOSPC; + + /* The last entry must have the largest offset. */ + if (i != 0) { + if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + last_offset = earg->code[i - 1].offset; } - return -ENOSPC; + + offset = last_offset + sizeof(unsigned long); + store_entry_arg_at(&earg->code[i], argnum, offset); + return offset; } int traceprobe_get_entry_data_size(struct trace_probe *tp) { struct probe_entry_arg *earg = tp->entry_arg; - int i, size = 0; if (!earg) return 0; - /* - * earg->code[] array has an operation sequence which is run in - * the entry handler. 
- * The sequence stopped by FETCH_OP_END and each data stored in - * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA - * stores the data at the data buffer + its offset, and all data are - * "unsigned long" size. The offset must be increased when a data is - * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the - * code array. - */ - for (i = 0; i < earg->size; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - goto out; - case FETCH_OP_ST_EDATA: - size = earg->code[i].offset + sizeof(unsigned long); - break; - default: - break; - } - } -out: - return size; + return get_entry_arg_max_offset(earg) + sizeof(unsigned long); } void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 854e5668f5ee..842383fbc03b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -10,20 +10,22 @@ * Author: Srikar Dronamraju */ +#include <linux/bitops.h> +#include <linux/btf.h> +#include <linux/cleanup.h> +#include <linux/kprobes.h> +#include <linux/limits.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/tracefs.h> -#include <linux/types.h> #include <linux/string.h> -#include <linux/ptrace.h> -#include <linux/perf_event.h> -#include <linux/kprobes.h> #include <linux/stringify.h> -#include <linux/limits.h> +#include <linux/tracefs.h> +#include <linux/types.h> #include <linux/uaccess.h> -#include <linux/bitops.h> -#include <linux/btf.h> + #include <asm/bitsperlong.h> #include "trace.h" @@ -438,6 +440,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); * this MUST be called for clean up the context and return a resource. 
*/ void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); +static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx) +{ + traceprobe_finish_parse(ctx); + kfree(ctx); +} + +DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *, + if (_T) traceprobe_free_parse_ctx(_T)) extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f95a2c3d5b1b..8b0bcc0d8f41 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -8,17 +8,19 @@ #define pr_fmt(fmt) "trace_uprobe: " fmt #include <linux/bpf-cgroup.h> -#include <linux/security.h> +#include <linux/cleanup.h> #include <linux/ctype.h> +#include <linux/filter.h> #include <linux/module.h> -#include <linux/uaccess.h> -#include <linux/uprobes.h> #include <linux/namei.h> -#include <linux/string.h> -#include <linux/rculist.h> -#include <linux/filter.h> #include <linux/percpu.h> +#include <linux/rculist.h> +#include <linux/security.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include "trace.h" #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -537,15 +539,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu) */ static int __trace_uprobe_create(int argc, const char **argv) { - struct trace_uprobe *tu; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - enum probe_print_type ptype; - struct path path; unsigned long offset, ref_ctr_offset; + char *gbuf __free(kfree) = NULL; + char *buf __free(kfree) = NULL; + enum probe_print_type ptype; + struct trace_uprobe *tu; bool is_return = false; + struct path path; int i, ret; ref_ctr_offset = 0; @@ -653,6 +655,10 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + goto fail_mem; + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -664,15 +670,16 @@ static int __trace_uprobe_create(int argc, const char **argv) char *ptr; tail = kstrdup(kbasename(filename), GFP_KERNEL); - if (!tail) { - ret = -ENOMEM; - goto fail_address_parse; - } + if (!tail) + goto fail_mem; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; + buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf) + goto fail_mem; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); @@ -695,13 +702,16 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc; i++) { - struct traceprobe_parse_context ctx = { - .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) + = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + ret = -ENOMEM; + goto error; + } + ctx->flags = (is_return ? 
TPARG_FL_RETURN : 0) | TPARG_FL_USER; trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); - traceprobe_finish_parse(&ctx); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) goto error; } @@ -721,6 +731,9 @@ out: trace_probe_log_clear(); return ret; +fail_mem: + ret = -ENOMEM; + fail_address_parse: trace_probe_log_clear(); path_put(&path); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c deleted file mode 100644 index 8303f4c7ca71..000000000000 --- a/kernel/usermode_driver.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * umd - User mode driver support - */ -#include <linux/shmem_fs.h> -#include <linux/pipe_fs_i.h> -#include <linux/mount.h> -#include <linux/fs_struct.h> -#include <linux/task_work.h> -#include <linux/usermode_driver.h> - -static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) -{ - struct file_system_type *type; - struct vfsmount *mnt; - struct file *file; - ssize_t written; - loff_t pos = 0; - - type = get_fs_type("tmpfs"); - if (!type) - return ERR_PTR(-ENODEV); - - mnt = kern_mount(type); - put_filesystem(type); - if (IS_ERR(mnt)) - return mnt; - - file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); - if (IS_ERR(file)) { - kern_unmount(mnt); - return ERR_CAST(file); - } - - written = kernel_write(file, data, len, &pos); - if (written != len) { - int err = written; - if (err >= 0) - err = -ENOMEM; - filp_close(file, NULL); - kern_unmount(mnt); - return ERR_PTR(err); - } - - fput(file); - - /* Flush delayed fput so exec can open the file read-only */ - flush_delayed_fput(); - task_work_run(); - return mnt; -} - -/** - * umd_load_blob - Remember a blob of bytes for fork_usermode_driver - * @info: information about usermode driver - * @data: a blob of bytes that can be executed as a file - * @len: The lentgh of the blob - * - */ -int umd_load_blob(struct umd_info *info, const void *data, size_t len) -{ - struct vfsmount *mnt; - - if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) - return -EBUSY; - - mnt = blob_to_mnt(data, len, info->driver_name); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - info->wd.mnt = mnt; - info->wd.dentry = mnt->mnt_root; - return 0; -} -EXPORT_SYMBOL_GPL(umd_load_blob); - -/** - * umd_unload_blob - Disassociate @info from a previously loaded blob - * @info: information about usermode driver - * - */ -int umd_unload_blob(struct umd_info *info) -{ - if (WARN_ON_ONCE(!info->wd.mnt || - !info->wd.dentry || - info->wd.mnt->mnt_root != info->wd.dentry)) - return -EINVAL; - - kern_unmount(info->wd.mnt); - info->wd.mnt = NULL; - info->wd.dentry = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(umd_unload_blob); - -static int umd_setup(struct subprocess_info *info, struct cred *new) -{ - struct umd_info *umd_info = info->data; - struct file *from_umh[2]; - struct file *to_umh[2]; - int err; - - /* create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - set_fs_pwd(current->fs, &umd_info->wd); - umd_info->pipe_to_umh = to_umh[1]; - 
umd_info->pipe_from_umh = from_umh[0]; - umd_info->tgid = get_pid(task_tgid(current)); - return 0; -} - -static void umd_cleanup(struct subprocess_info *info) -{ - struct umd_info *umd_info = info->data; - - /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) - umd_cleanup_helper(umd_info); -} - -/** - * umd_cleanup_helper - release the resources which were allocated in umd_setup - * @info: information about usermode driver - */ -void umd_cleanup_helper(struct umd_info *info) -{ - fput(info->pipe_to_umh); - fput(info->pipe_from_umh); - put_pid(info->tgid); - info->tgid = NULL; -} -EXPORT_SYMBOL_GPL(umd_cleanup_helper); - -/** - * fork_usermode_driver - fork a usermode driver - * @info: information about usermode driver (shouldn't be NULL) - * - * Returns either negative error or zero which indicates success in - * executing a usermode driver. In such case 'struct umd_info *info' - * is populated with two pipes and a tgid of the process. The caller is - * responsible for health check of the user process, killing it via - * tgid, and closing the pipes when user process is no longer needed. - */ -int fork_usermode_driver(struct umd_info *info) -{ - struct subprocess_info *sub_info; - const char *argv[] = { info->driver_name, NULL }; - int err; - - if (WARN_ON_ONCE(info->tgid)) - return -EBUSY; - - err = -ENOMEM; - sub_info = call_usermodehelper_setup(info->driver_name, - (char **)argv, NULL, GFP_KERNEL, - umd_setup, umd_cleanup, info); - if (!sub_info) - goto out; - - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); -out: - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_driver); - - diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 75af12ff774e..9c58f5b4381d 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -187,6 +187,28 @@ void watchdog_hardlockup_disable(unsigned int cpu) } /** + * hardlockup_detector_perf_adjust_period - Adjust the event period due + * to current cpu frequency change + * @period: The target period to be set + */ +void hardlockup_detector_perf_adjust_period(u64 period) +{ + struct perf_event *event = this_cpu_read(watchdog_ev); + + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) + return; + + if (!event) + return; + + if (event->attr.sample_period == period) + return; + + if (perf_event_period(event, period)) + pr_err("failed to change period to %llu\n", period); +} + +/** * hardlockup_detector_perf_stop - Globally stop watchdog events * * Special interface for x86 to handle the perf HT bug. |
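
Note on usage (not part of the diff above): the new hardlockup_detector_perf_adjust_period() reads this_cpu_read(watchdog_ev), so it only retunes the perf event owned by the CPU it runs on; a caller is expected to invoke it in the context of the CPU whose frequency changed. The sketch below is a hypothetical cpufreq transition notifier showing the intended calling pattern. The notifier, its name, and the freqs->new * 1000 * watchdog_thresh period formula are illustrative assumptions, not taken from this patch; the declaration of hardlockup_detector_perf_adjust_period() in linux/nmi.h is likewise assumed from context, since the hunk shown here only adds the definition in kernel/watchdog_perf.c.

/* Illustrative sketch only -- hypothetical caller, not part of the patch. */
#include <linux/cpufreq.h>
#include <linux/nmi.h>
#include <linux/notifier.h>

static int example_watchdog_freq_notifier(struct notifier_block *nb,
					   unsigned long action, void *data)
{
	struct cpufreq_freqs *freqs = data;

	if (action != CPUFREQ_POSTCHANGE)
		return NOTIFY_DONE;

	/*
	 * Recompute the sample period so the hard lockup check still fires
	 * roughly every watchdog_thresh seconds at the new CPU frequency.
	 * freqs->new is in kHz, hence the extra factor of 1000.
	 */
	hardlockup_detector_perf_adjust_period((u64)freqs->new * 1000 *
					       watchdog_thresh);
	return NOTIFY_OK;
}

Registering this with cpufreq_register_notifier(..., CPUFREQ_TRANSITION_NOTIFIER) would keep the hardlockup sample period proportional to the running clock, which is the scenario the kernel-doc above ("Adjust the event period due to current cpu frequency change") describes.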