diff options
Diffstat (limited to 'kernel')
46 files changed, 1794 insertions, 767 deletions
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 2f0597320b6d..6639640523c0 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -125,8 +125,12 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, fd = *(int *)key; f = fget_raw(fd); - if (!f || !inode_storage_ptr(f->f_inode)) + if (!f) + return -EBADF; + if (!inode_storage_ptr(f->f_inode)) { + fput(f); return -EBADF; + } sdata = bpf_local_storage_update(f->f_inode, (struct bpf_local_storage_map *)map, diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5454161407f1..a0d9eade9c80 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -287,7 +287,7 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 1b6b9349cb85..d99e89f113c4 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { + u8 node_type = READ_ONCE(node->type); unsigned long flags; - if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || - WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; - if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 70e5e0b6d69d..1622a44d1617 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -149,7 +149,11 @@ BTF_ID(func, bpf_lsm_file_ioctl) BTF_ID(func, bpf_lsm_file_lock) BTF_ID(func, bpf_lsm_file_open) BTF_ID(func, bpf_lsm_file_receive) + +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_inet_conn_established) +#endif /* CONFIG_SECURITY_NETWORK */ + BTF_ID(func, bpf_lsm_inode_create) BTF_ID(func, bpf_lsm_inode_free_security) BTF_ID(func, bpf_lsm_inode_getattr) @@ -166,7 +170,11 @@ BTF_ID(func, bpf_lsm_inode_symlink) BTF_ID(func, bpf_lsm_inode_unlink) BTF_ID(func, bpf_lsm_kernel_module_request) BTF_ID(func, bpf_lsm_kernfs_init_security) + +#ifdef CONFIG_KEYS BTF_ID(func, bpf_lsm_key_free) +#endif /* CONFIG_KEYS */ + BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) @@ -181,6 +189,8 @@ BTF_ID(func, bpf_lsm_sb_show_options) BTF_ID(func, bpf_lsm_sb_statfs) BTF_ID(func, bpf_lsm_sb_umount) BTF_ID(func, bpf_lsm_settime) + +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_accept) BTF_ID(func, bpf_lsm_socket_bind) BTF_ID(func, bpf_lsm_socket_connect) @@ -195,6 +205,8 @@ BTF_ID(func, bpf_lsm_socket_recvmsg) BTF_ID(func, bpf_lsm_socket_sendmsg) BTF_ID(func, bpf_lsm_socket_shutdown) BTF_ID(func, bpf_lsm_socket_socketpair) +#endif /* CONFIG_SECURITY_NETWORK */ + BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) BTF_ID(func, bpf_lsm_task_getsecid) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 84a36ee4a4c2..2efeb5f4b343 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -458,7 +458,7 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static u32 btf_nr_types_total(const struct btf *btf) +u32 btf_nr_types(const struct btf *btf) { u32 total = 0; @@ -476,7 +476,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) const char *tname; u32 i, total; - total = btf_nr_types_total(btf); + total = btf_nr_types(btf); for (i = 1; i < total; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) @@ -3540,11 +3540,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen == 0"); - return -EINVAL; - } - if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; @@ -5296,15 +5291,16 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. */ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; - u32 i, nargs, btf_id; + const struct btf_type *t, *ref_t; + u32 i, nargs, btf_id, type_size; const char *tname; + bool is_global; if (!prog->aux->func_info) return -EINVAL; @@ -5338,38 +5334,57 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); goto out; } + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - if (reg[i + 1].type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", i + 1); goto out; } if (btf_type_is_ptr(t)) { - if (reg[i + 1].type == SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer\n", i + 1); - goto out; - } /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { - if (reg[i + 1].type != PTR_TO_CTX) { + if (reg->type != PTR_TO_CTX) { bpf_log(log, "arg#%d expected pointer to ctx, but got %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ctx_reg(env, reg, i + 1)) goto out; continue; } + + if (!is_global) + goto out; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, &type_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + goto out; + } + + if (check_mem_reg(env, reg, i + 1, type_size)) + goto out; + + continue; } bpf_log(log, "Unrecognized arg#%d type %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5393,14 +5408,14 @@ out: * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; + const struct btf_type *t, *ref_t; u32 i, nargs, btf_id; const char *tname; @@ -5464,16 +5479,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - reg[i + 1].type = SCALAR_VALUE; + reg->type = SCALAR_VALUE; continue; } - if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg[i + 1].type = PTR_TO_CTX; + if (btf_type_is_ptr(t)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + reg->type = PTR_TO_CTX; + continue; + } + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, ®->mem_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + return -EINVAL; + } + + reg->type = PTR_TO_MEM_OR_NULL; + reg->id = ++env->id_gen; + continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", @@ -5743,6 +5777,11 @@ bool btf_is_kernel(const struct btf *btf) return btf->kernel_btf; } +bool btf_is_module(const struct btf *btf) +{ + return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; @@ -5877,3 +5916,25 @@ static int __init btf_module_init(void) fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + +struct module *btf_try_get_module(const struct btf *btf) +{ + struct module *res = NULL; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; + + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf != btf) + continue; + + if (try_module_get(btf_mod->module)) + res = btf_mod->module; + + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return res; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 96555a8a2c54..b567ca46555c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0; cleanup: @@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted * @t_ctx: Pointer to attach type specific context + * @flags: Pointer to u32 which contains higher bits of BPF program + * return value (OR'ed together). * * socket is expected to be of type INET or INET6. * @@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx) + void *t_ctx, + u32 *flags) { struct bpf_sock_addr_kern ctx = { .sk = sk, @@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN, flags); return ret == 1 ? 0 : -EPERM; } @@ -1298,7 +1302,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; } -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1315,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; } + if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. + */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1333,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; } -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); } +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1340,8 +1364,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1350,7 +1373,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1390,14 +1413,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } } out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } @@ -1407,6 +1447,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1419,13 +1460,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1442,6 +1482,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } + if (ctx.optlen < 0) { + ret = -EFAULT; + goto out; + } + if (copy_from_user(ctx.optval, optval, min(ctx.optlen, max_optlen)) != 0) { ret = -EFAULT; @@ -1459,7 +1504,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen) { + if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; goto out; } @@ -1483,9 +1528,55 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval; out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } + +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + .optlen = *optlen, + .optval = optval, + .optval_end = optval + *optlen, + }; + int ret; + + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy + * user data back into BPF buffer when reval != 0. This is + * done as an optimization to avoid extra copy, assuming + * kernel won't populate the data in case of an error. + * Here we always pass the data and memset() should + * be called if that data shouldn't be "exported". + */ + + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + if (!ret) + return -EPERM; + + if (ctx.optlen > *optlen) + return -EFAULT; + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) + return -EFAULT; + + /* BPF programs can shrink the buffer, export the modifications. + */ + if (ctx.optlen != 0) + *optlen = ctx.optlen; + + return ctx.retval; +} #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 261f8692d0d2..0ae015ad1e05 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } + fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + if (!fp->active) { + vfree(fp); + kfree(aux); + return NULL; + } fp->pages = size / PAGE_SIZE; fp->aux = aux; @@ -114,8 +120,9 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (!prog) return NULL; - prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); - if (!prog->aux->stats) { + prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->stats) { + free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; @@ -124,7 +131,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; - pstats = per_cpu_ptr(prog->aux->stats, cpu); + pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; @@ -238,6 +245,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, * reallocated structure. */ fp_old->aux = NULL; + fp_old->stats = NULL; + fp_old->active = NULL; __bpf_prog_free(fp_old); } @@ -249,10 +258,11 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); - free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); } + free_percpu(fp->stats); + free_percpu(fp->active); vfree(fp); } @@ -1309,8 +1319,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, XADD, W), \ - INSN_3(STX, XADD, DW), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -1618,13 +1628,59 @@ out: LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); +#define ATOMIC_ALU_OP(BOP, KOP) \ + case BOP: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ + (DST + insn->off)); \ + else \ + atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ + (DST + insn->off)); \ + break; \ + case BOP | BPF_FETCH: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + SRC = (u32) atomic_fetch_##KOP( \ + (u32) SRC, \ + (atomic_t *)(unsigned long) (DST + insn->off)); \ + else \ + SRC = (u64) atomic64_fetch_##KOP( \ + (u64) SRC, \ + (atomic64_t *)(unsigned long) (DST + insn->off)); \ + break; + + STX_ATOMIC_DW: + STX_ATOMIC_W: + switch (IMM) { + ATOMIC_ALU_OP(BPF_ADD, add) + ATOMIC_ALU_OP(BPF_AND, and) + ATOMIC_ALU_OP(BPF_OR, or) + ATOMIC_ALU_OP(BPF_XOR, xor) +#undef ATOMIC_ALU_OP + + case BPF_XCHG: + if (BPF_SIZE(insn->code) == BPF_W) + SRC = (u32) atomic_xchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) SRC); + else + SRC = (u64) atomic64_xchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) SRC); + break; + case BPF_CMPXCHG: + if (BPF_SIZE(insn->code) == BPF_W) + BPF_R0 = (u32) atomic_cmpxchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) BPF_R0, (u32) SRC); + else + BPF_R0 = (u64) atomic64_cmpxchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) BPF_R0, (u64) SRC); + break; + + default: + goto default_label; + } CONT; default_label: @@ -1634,7 +1690,8 @@ out: * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ - pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", + insn->code, insn->imm); BUG_ON(1); return 0; } @@ -2119,6 +2176,28 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct btf_mod_pair *btf_mod; + u32 i; + + for (i = 0; i < len; i++) { + btf_mod = &used_btfs[i]; + if (btf_mod->module) + module_put(btf_mod->module); + btf_put(btf_mod->btf); + } +#endif +} + +static void bpf_free_used_btfs(struct bpf_prog_aux *aux) +{ + __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); + kfree(aux->used_btfs); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; @@ -2126,6 +2205,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); bpf_free_used_maps(aux); + bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 747313698178..5d1469de6921 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -141,49 +141,6 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, - struct sk_buff *skb) -{ - unsigned int hard_start_headroom; - unsigned int frame_size; - void *pkt_data_start; - - /* Part of headroom was reserved to xdpf */ - hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - - /* Memory size backing xdp_frame data already have reserved - * room for build_skb to place skb_shared_info in tailroom. - */ - frame_size = xdpf->frame_sz; - - pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb_around(skb, pkt_data_start, frame_size); - if (unlikely(!skb)) - return NULL; - - skb_reserve(skb, hard_start_headroom); - __skb_put(skb, xdpf->len); - if (xdpf->metasize) - skb_metadata_set(skb, xdpf->metasize); - - /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdpf->dev_rx); - - /* Optional SKB info, currently missing: - * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) - * - RX ring dev queue index (skb_record_rx_queue) - */ - - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); - - /* Allow SKB to reuse area used by xdp_frame */ - xdp_scrub_frame(xdpf); - - return skb; -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -350,7 +307,8 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(xdpf, skb); + skb = __xdp_build_skb_from_frame(xdpf, skb, + xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..85d9d1b72a33 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -802,9 +802,7 @@ static int dev_map_notification(struct notifier_block *notifier, break; /* will be freed in free_netdev() */ - netdev->xdp_bulkq = - __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), - sizeof(void *), GFP_ATOMIC); + netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index b44d8c447afd..3acc7e0b6916 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = { [BPF_END >> 4] = "endian", }; +static const char *const bpf_atomic_alu_string[16] = { + [BPF_ADD >> 4] = "add", + [BPF_AND >> 4] = "and", + [BPF_OR >> 4] = "or", + [BPF_XOR >> 4] = "or", +}; + static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", @@ -153,14 +160,44 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + else if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm == BPF_ADD || insn->imm == BPF_AND || + insn->imm == BPF_OR || insn->imm == BPF_XOR)) { + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + bpf_alu_string[BPF_OP(insn->imm) >> 4], + insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm == (BPF_ADD | BPF_FETCH) || + insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH))) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG) { + verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", insn->code, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_XCHG) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); + } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); + } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c1ac7f964bc9..d63912e73ad9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1148,7 +1148,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1202,7 +1202,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41ca280b1dc1..308427fe03a3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -720,14 +720,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - if (!perfmon_capable()) - return NULL; - return bpf_get_trace_printk_proto(); - case BPF_FUNC_snprintf_btf: - if (!perfmon_capable()) - return NULL; - return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: @@ -742,6 +734,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return NULL; switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_probe_read_user: @@ -752,6 +746,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 23ee310b6eb4..1951332dd15f 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -4,8 +4,11 @@ LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ LIBBPF_A = $(obj)/libbpf.a LIBBPF_OUT = $(abspath $(obj)) +# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test +# in tools/scripts/Makefile.include always succeeds when building the kernel +# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg". $(LIBBPF_A): - $(Q)$(MAKE) -C $(LIBBPF_SRCS) OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a + $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ -I $(srctree)/tools/lib/ -Wno-unused-result diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c index b7ff87939172..5d872a705470 100644 --- a/kernel/bpf/preload/iterators/iterators.c +++ b/kernel/bpf/preload/iterators/iterators.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include <argp.h> +#include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b638473..be35bfb7fb13 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,9 @@ #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> -#include <linux/elf.h> -#include <linux/pagemap.h> #include <linux/irq_work.h> #include <linux/btf_ids.h> +#include <linux/buildid.h> #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -115,6 +114,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ n_buckets = roundup_pow_of_two(attr->max_entries); + if (!n_buckets) + return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); @@ -143,140 +144,6 @@ free_smap: return ERR_PTR(err); } -#define BPF_BUILD_ID 3 -/* - * Parse build id from the note segment. This logic can be shared between - * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are - * identical. - */ -static inline int stack_map_parse_build_id(void *page_addr, - unsigned char *build_id, - void *note_start, - Elf32_Word note_size) -{ - Elf32_Word note_offs = 0, new_offs; - - /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) - return -EINVAL; - - /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) - return -EINVAL; - - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); - - if (nhdr->n_type == BPF_BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BPF_BUILD_ID_SIZE - nhdr->n_descsz); - return 0; - } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; - } - return -EINVAL; -} - -/* Parse build ID from 32-bit ELF */ -static int stack_map_get_build_id_32(void *page_addr, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; - Elf32_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) - return -EINVAL; - - phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID from 64-bit ELF */ -static int stack_map_get_build_id_64(void *page_addr, - unsigned char *build_id) -{ - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; - Elf64_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) - return -EINVAL; - - phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID of ELF file mapped to vma */ -static int stack_map_get_build_id(struct vm_area_struct *vma, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr; - struct page *page; - void *page_addr; - int ret; - - /* only works for page backed storage */ - if (!vma->vm_file) - return -EINVAL; - - page = find_get_page(vma->vm_file->f_mapping, 0); - if (!page) - return -EFAULT; /* page not mapped */ - - ret = -EINVAL; - page_addr = kmap_atomic(page); - ehdr = (Elf32_Ehdr *)page_addr; - - /* compare magic x7f "ELF" */ - if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) - goto out; - - /* only support executable file and shared object file */ - if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) - goto out; - - if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = stack_map_get_build_id_32(page_addr, build_id); - else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = stack_map_get_build_id_64(page_addr, build_id); -out: - kunmap_atomic(page_addr); - put_page(page); - return ret; -} - static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { @@ -317,18 +184,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e5999d86c76e..c859bc46d06c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_stats *stats) { - u64 nsecs = 0, cnt = 0; + u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; - u64 tnsecs, tcnt; + u64 tnsecs, tcnt, tmisses; - st = per_cpu_ptr(prog->aux->stats, cpu); + st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; tcnt = st->cnt; + tmisses = st->misses; } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; + misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; + stats->misses = misses; } #ifdef CONFIG_PROC_FS @@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" - "run_cnt:\t%llu\n", + "run_cnt:\t%llu\n" + "recursion_misses:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, - stats.cnt); + stats.cnt, + stats.misses); } #endif @@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; + info.recursion_misses = stats.misses; if (!bpf_capable()) { info.jited_prog_len = 0; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 175b7b42bfc4..b68cb5d6d6eb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -286,9 +286,248 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +struct bpf_iter_seq_task_vma_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct vm_area_struct *vma; + u32 tid; + unsigned long prev_vm_start; + unsigned long prev_vm_end; +}; + +enum bpf_task_vma_iter_find_op { + task_vma_iter_first_vma, /* use mm->mmap */ + task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_find_vma, /* use find_vma() to find next vma */ +}; + +static struct vm_area_struct * +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) +{ + struct pid_namespace *ns = info->common.ns; + enum bpf_task_vma_iter_find_op op; + struct vm_area_struct *curr_vma; + struct task_struct *curr_task; + u32 curr_tid = info->tid; + + /* If this function returns a non-NULL vma, it holds a reference to + * the task_struct, and holds read lock on vma->mm->mmap_lock. + * If this function returns NULL, it does not hold any reference or + * lock. + */ + if (info->task) { + curr_task = info->task; + curr_vma = info->vma; + /* In case of lock contention, drop mmap_lock to unblock + * the writer. + * + * After relock, call find(mm, prev_vm_end - 1) to find + * new vma to process. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * For example, curr_vma == VMA2. Before unlock, we set + * + * prev_vm_start = 8k + * prev_vm_end = 16k + * + * There are a few cases: + * + * 1) VMA2 is freed, but VMA3 exists. + * + * find_vma() will return VMA3, just process VMA3. + * + * 2) VMA2 still exists. + * + * find_vma() will return VMA2, process VMA2->next. + * + * 3) no more vma in this mm. + * + * Process the next task. + * + * 4) find_vma() returns a different vma, VMA2'. + * + * 4.1) If VMA2 covers same range as VMA2', skip VMA2', + * because we already covered the range; + * 4.2) VMA2 and VMA2' covers different ranges, process + * VMA2'. + */ + if (mmap_lock_is_contended(curr_task->mm)) { + info->prev_vm_start = curr_vma->vm_start; + info->prev_vm_end = curr_vma->vm_end; + op = task_vma_iter_find_vma; + mmap_read_unlock(curr_task->mm); + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } else { + op = task_vma_iter_next_vma; + } + } else { +again: + curr_task = task_seq_get_next(ns, &curr_tid, true); + if (!curr_task) { + info->tid = curr_tid + 1; + goto finish; + } + + if (curr_tid != info->tid) { + info->tid = curr_tid; + /* new task, process the first vma */ + op = task_vma_iter_first_vma; + } else { + /* Found the same tid, which means the user space + * finished data in previous buffer and read more. + * We dropped mmap_lock before returning to user + * space, so it is necessary to use find_vma() to + * find the next vma to process. + */ + op = task_vma_iter_find_vma; + } + + if (!curr_task->mm) + goto next_task; + + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } + + switch (op) { + case task_vma_iter_first_vma: + curr_vma = curr_task->mm->mmap; + break; + case task_vma_iter_next_vma: + curr_vma = curr_vma->vm_next; + break; + case task_vma_iter_find_vma: + /* We dropped mmap_lock so it is necessary to use find_vma + * to find the next vma. This is similar to the mechanism + * in show_smaps_rollup(). + */ + curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + /* case 1) and 4.2) above just use curr_vma */ + + /* check for case 2) or case 4.1) above */ + if (curr_vma && + curr_vma->vm_start == info->prev_vm_start && + curr_vma->vm_end == info->prev_vm_end) + curr_vma = curr_vma->vm_next; + break; + } + if (!curr_vma) { + /* case 3) above, or case 2) 4.1) with vma->next == NULL */ + mmap_read_unlock(curr_task->mm); + goto next_task; + } + info->task = curr_task; + info->vma = curr_vma; + return curr_vma; + +next_task: + put_task_struct(curr_task); + info->task = NULL; + curr_tid++; + goto again; + +finish: + if (curr_task) + put_task_struct(curr_task); + info->task = NULL; + info->vma = NULL; + return NULL; +} + +static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct vm_area_struct *vma; + + vma = task_vma_seq_get_next(info); + if (vma && *pos == 0) + ++*pos; + + return vma; +} + +static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + ++*pos; + return task_vma_seq_get_next(info); +} + +struct bpf_iter__task_vma { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + __bpf_md_ptr(struct vm_area_struct *, vma); +}; + +DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, + struct task_struct *task, struct vm_area_struct *vma) + +static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct bpf_iter__task_vma ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.vma = info->vma; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_vma_seq_show(struct seq_file *seq, void *v) +{ + return __task_vma_seq_show(seq, false); +} + +static void task_vma_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + if (!v) { + (void)__task_vma_seq_show(seq, true); + } else { + /* info->vma has not been seen by the BPF program. If the + * user space reads more, task_vma_seq_get_next should + * return this vma again. Set prev_vm_start to ~0UL, + * so that we don't skip the vma returned by the next + * find_vma() (case task_vma_iter_find_vma in + * task_vma_seq_get_next()). + */ + info->prev_vm_start = ~0UL; + info->prev_vm_end = info->vma->vm_end; + mmap_read_unlock(info->task->mm); + put_task_struct(info->task); + info->task = NULL; + } +} + +static const struct seq_operations task_vma_seq_ops = { + .start = task_vma_seq_start, + .next = task_vma_seq_next, + .stop = task_vma_seq_stop, + .show = task_vma_seq_show, +}; + BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) +BTF_ID(struct, vm_area_struct) static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, @@ -328,6 +567,26 @@ static struct bpf_iter_reg task_file_reg_info = { .seq_info = &task_file_seq_info, }; +static const struct bpf_iter_seq_info task_vma_seq_info = { + .seq_ops = &task_vma_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info), +}; + +static struct bpf_iter_reg task_vma_reg_info = { + .target = "task_vma", + .feature = BPF_ITER_RESCHED, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_vma, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_vma, vma), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &task_vma_seq_info, +}; + static int __init task_iter_init(void) { int ret; @@ -339,6 +598,12 @@ static int __init task_iter_init(void) task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; - return bpf_iter_reg_target(&task_file_reg_info); + ret = bpf_iter_reg_target(&task_file_reg_info); + if (ret) + return ret; + + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2]; + return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 35c5887d82ff..7bc3b3209224 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -381,55 +381,100 @@ out: mutex_unlock(&trampoline_mutex); } +#define NO_START_TIME 1 +static u64 notrace bpf_prog_start_time(void) +{ + u64 start = NO_START_TIME; + + if (static_branch_unlikely(&bpf_stats_enabled_key)) { + start = sched_clock(); + if (unlikely(!start)) + start = NO_START_TIME; + } + return start; +} + +static void notrace inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->misses++; + u64_stats_update_end(&stats->syncp); +} + /* The logic is similar to BPF_PROG_RUN, but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into - * call _bpf_prog_enter + * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit + * + * __bpf_prog_enter returns: + * 0 - skip execution of the bpf prog + * 1 - execute bpf prog + * [2..MAX_U64] - excute bpf prog and record execution time. + * This is start time. */ -u64 notrace __bpf_prog_enter(void) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog) __acquires(RCU) { - u64 start = 0; - rcu_read_lock(); migrate_disable(); - if (static_branch_unlikely(&bpf_stats_enabled_key)) - start = sched_clock(); - return start; + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); + return 0; + } + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) - __releases(RCU) +static void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) { struct bpf_prog_stats *stats; if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter - * and disabled in __bpf_prog_exit. + /* static_key could be enabled in __bpf_prog_enter* + * and disabled in __bpf_prog_exit*. * And vice versa. - * Hence check that 'start' is not zero. + * Hence check that 'start' is valid. */ - start) { - stats = this_cpu_ptr(prog->aux->stats); + start > NO_START_TIME) { + stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) +{ + update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -void notrace __bpf_prog_enter_sleepable(void) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) { rcu_read_lock_trace(); + migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); + return 0; + } + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(void) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) { + update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); + migrate_enable(); rcu_read_unlock_trace(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7368c5eacb7..1dda9d81f12c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -228,6 +228,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static bool bpf_pseudo_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_CALL; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1073,6 +1079,51 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + switch (reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + break; + } + case PTR_TO_SOCKET_OR_NULL: + reg->type = PTR_TO_SOCKET; + break; + case PTR_TO_SOCK_COMMON_OR_NULL: + reg->type = PTR_TO_SOCK_COMMON; + break; + case PTR_TO_TCP_SOCK_OR_NULL: + reg->type = PTR_TO_TCP_SOCK; + break; + case PTR_TO_BTF_ID_OR_NULL: + reg->type = PTR_TO_BTF_ID; + break; + case PTR_TO_MEM_OR_NULL: + reg->type = PTR_TO_MEM; + break; + case PTR_TO_RDONLY_BUF_OR_NULL: + reg->type = PTR_TO_RDONLY_BUF; + break; + case PTR_TO_RDWR_BUF_OR_NULL: + reg->type = PTR_TO_RDWR_BUF; + break; + default: + WARN_ON("unknown nullable register type"); + } +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -1486,9 +1537,7 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { verbose(env, @@ -2271,12 +2320,14 @@ static void save_register_state(struct bpf_func_state *state, state->stack[spi].slot_type[i] = STACK_SPILL; } -/* check_stack_read/write functions track spill/fill of registers, +/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) +static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + /* stack frame we're writing to */ + struct bpf_func_state *state, + int off, int size, int value_regno, + int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -2402,9 +2453,175 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is + * known to contain a variable offset. + * This function checks whether the write is permitted and conservatively + * tracks the effects of the write, considering that each stack slot in the + * dynamic range is potentially written to. + * + * 'off' includes 'regno->off'. + * 'value_regno' can be -1, meaning that an unknown value is being written to + * the stack. + * + * Spilled pointers in range are not marked as written because we don't know + * what's going to be actually written. This means that read propagation for + * future reads cannot be terminated by this write. + * + * For privileged programs, uninitialized stack slots are considered + * initialized by this write (even though we don't know exactly what offsets + * are going to be written to). The idea is that we don't want the verifier to + * reject future reads that access slots written to through variable offsets. + */ +static int check_stack_write_var_off(struct bpf_verifier_env *env, + /* func where register points to */ + struct bpf_func_state *state, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_func_state *cur; /* state of the current function */ + int min_off, max_off; + int i, err; + struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + bool writing_zero = false; + /* set if the fact that we're writing a zero is used to let any + * stack slots remain STACK_ZERO + */ + bool zero_used = false; + + cur = env->cur_state->frame[env->cur_state->curframe]; + ptr_reg = &cur->regs[ptr_regno]; + min_off = ptr_reg->smin_value + off; + max_off = ptr_reg->smax_value + off + size; + if (value_regno >= 0) + value_reg = &cur->regs[value_regno]; + if (value_reg && register_is_null(value_reg)) + writing_zero = true; + + err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), + state->acquired_refs, true); + if (err) + return err; + + + /* Variable offset writes destroy any spilled pointers in range. */ + for (i = min_off; i < max_off; i++) { + u8 new_type, *stype; + int slot, spi; + + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + + if (!env->allow_ptr_leaks + && *stype != NOT_INIT + && *stype != SCALAR_VALUE) { + /* Reject the write if there's are spilled pointers in + * range. If we didn't reject here, the ptr status + * would be erased below (even though not all slots are + * actually overwritten), possibly opening the door to + * leaks. + */ + verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", + insn_idx, i); + return -EINVAL; + } + + /* Erase all spilled pointers. */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + + /* Update the slot type. */ + new_type = STACK_MISC; + if (writing_zero && *stype == STACK_ZERO) { + new_type = STACK_ZERO; + zero_used = true; + } + /* If the slot is STACK_INVALID, we check whether it's OK to + * pretend that it will be initialized by this write. The slot + * might not actually be written to, and so if we mark it as + * initialized future reads might leak uninitialized memory. + * For privileged programs, we will accept such reads to slots + * that may or may not be written because, if we're reject + * them, the error would be too confusing. + */ + if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", + insn_idx, i); + return -EINVAL; + } + *stype = new_type; + } + if (zero_used) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + return 0; +} + +/* When register 'dst_regno' is assigned some values from stack[min_off, + * max_off), we set the register's type according to the types of the + * respective stack slots. If all the stack values are known to be zeros, then + * so is the destination reg. Otherwise, the register is considered to be + * SCALAR. This function does not deal with register filling; the caller must + * ensure that all spilled registers in the stack range have been marked as + * read. + */ +static void mark_reg_stack_read(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *ptr_state, + int min_off, int max_off, int dst_regno) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + int i, slot, spi; + u8 *stype; + int zeros = 0; + + for (i = min_off; i < max_off; i++) { + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = ptr_state->stack[spi].slot_type; + if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) + break; + zeros++; + } + if (zeros == max_off - min_off) { + /* any access_size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[dst_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[dst_regno].precise = true; + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, dst_regno); + } + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; +} + +/* Read the stack at 'off' and put the results into the register indicated by + * 'dst_regno'. It handles reg filling if the addressed stack slot is a + * spilled reg. + * + * 'dst_regno' can be -1, meaning that the read value is not going to a + * register. + * + * The access is assumed to be within the current stack bounds. + */ +static int check_stack_read_fixed_off(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *reg_state, + int off, int size, int dst_regno) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -2412,11 +2629,6 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype; - if (reg_state->allocated_stack <= slot) { - verbose(env, "invalid read from stack off %d+0 size %d\n", - off, size); - return -EACCES; - } stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -2427,9 +2639,9 @@ static int check_stack_read(struct bpf_verifier_env *env, verbose(env, "invalid size of register fill\n"); return -EACCES; } - if (value_regno >= 0) { - mark_reg_unknown(env, state->regs, value_regno); - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + if (dst_regno >= 0) { + mark_reg_unknown(env, state->regs, dst_regno); + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); return 0; @@ -2441,16 +2653,16 @@ static int check_stack_read(struct bpf_verifier_env *env, } } - if (value_regno >= 0) { + if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = *reg; + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { - /* If value_regno==-1, the caller is asking us whether + /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that @@ -2462,70 +2674,167 @@ static int check_stack_read(struct bpf_verifier_env *env, } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { - int zeros = 0; + u8 type; for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + type = stype[(slot - i) % BPF_REG_SIZE]; + if (type == STACK_MISC) continue; - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { - zeros++; + if (type == STACK_ZERO) continue; - } verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - if (value_regno >= 0) { - if (zeros == size) { - /* any size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[value_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. - */ - state->regs[value_regno].precise = true; - } else { - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); - } - state->regs[value_regno].live |= REG_LIVE_WRITTEN; - } + if (dst_regno >= 0) + mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); } return 0; } -static int check_stack_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) +enum stack_access_src { + ACCESS_DIRECT = 1, /* the access is performed by an instruction */ + ACCESS_HELPER = 2, /* the access is performed by a helper */ +}; + +static int check_stack_range_initialized(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum stack_access_src type, + struct bpf_call_arg_meta *meta); + +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) { - /* Stack accesses must be at a fixed offset, so that we - * can determine what type of data were returned. See - * check_stack_read(). + return cur_regs(env) + regno; +} + +/* Read the stack at 'ptr_regno + off' and put the result into the register + * 'dst_regno'. + * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * but not its variable offset. + * 'size' is assumed to be <= reg size and the access is assumed to be aligned. + * + * As opposed to check_stack_read_fixed_off, this function doesn't deal with + * filling registers (i.e. reads of spilled register cannot be detected when + * the offset is not fixed). We conservatively mark 'dst_regno' as containing + * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * offset; for a fixed offset check_stack_read_fixed_off should be used + * instead. + */ +static int check_stack_read_var_off(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, int dst_regno) +{ + /* The state of the source register. */ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *ptr_state = func(env, reg); + int err; + int min_off, max_off; + + /* Note that we pass a NULL meta, so raw access will not be permitted. */ - if (!tnum_is_const(reg->var_off)) { + err = check_stack_range_initialized(env, ptr_regno, off, size, + false, ACCESS_DIRECT, NULL); + if (err) + return err; + + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; + mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + return 0; +} + +/* check_stack_read dispatches to check_stack_read_fixed_off or + * check_stack_read_var_off. + * + * The caller must ensure that the offset falls within the allocated stack + * bounds. + * + * 'dst_regno' is a register which will receive the value from the stack. It + * can be -1, meaning that the read value is not going to a register. + */ +static int check_stack_read(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int dst_regno) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + /* Some accesses are only permitted with a static offset. */ + bool var_off = !tnum_is_const(reg->var_off); + + /* The offset is required to be static when reads don't go to a + * register, in order to not leak pointers (see + * check_stack_read_fixed_off). + */ + if (dst_regno < 0 && var_off) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d\n", + verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } + /* Variable offset is prohibited for unprivileged mode for simplicity + * since it requires corresponding support in Spectre masking for stack + * ALU. See also retrieve_ptr_limit(). + */ + if (!env->bypass_spec_v1 && var_off) { + char tn_buf[48]; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, size); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", + ptr_regno, tn_buf); return -EACCES; } - return 0; + if (!var_off) { + off += reg->var_off.value; + err = check_stack_read_fixed_off(env, state, off, size, + dst_regno); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. Note that dst_regno >= 0 on this + * branch. + */ + err = check_stack_read_var_off(env, ptr_regno, off, size, + dst_regno); + } + return err; +} + + +/* check_stack_write dispatches to check_stack_write_fixed_off or + * check_stack_write_var_off. + * + * 'ptr_regno' is the register used as a pointer into the stack. + * 'off' includes 'ptr_regno->off', but not its variable offset (if any). + * 'value_regno' is the register whose value we're writing to the stack. It can + * be -1, meaning that we're not writing from a register. + * + * The caller must ensure that the offset falls within the maximum stack size. + */ +static int check_stack_write(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + + if (tnum_is_const(reg->var_off)) { + off += reg->var_off.value; + err = check_stack_write_fixed_off(env, state, off, size, + value_regno, insn_idx); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. + */ + err = check_stack_write_var_off(env, state, + ptr_regno, off, size, + value_regno, insn_idx); + } + return err; } static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, @@ -2858,11 +3167,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } -static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) -{ - return cur_regs(env) + regno; -} - static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); @@ -2981,8 +3285,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; - /* The stack spill tracking logic in check_stack_write() - * and check_stack_read() relies on stack accesses being + /* The stack spill tracking logic in check_stack_write_fixed_off() + * and check_stack_read_fixed_off() relies on stack accesses being * aligned. */ strict = true; @@ -3074,9 +3378,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3400,6 +3702,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return 0; } +/* Check that the stack access at the given offset is within bounds. The + * maximum valid offset is -1. + * + * The minimum valid offset is -MAX_BPF_STACK for writes, and + * -state->allocated_stack for reads. + */ +static int check_stack_slot_within_bounds(int off, + struct bpf_func_state *state, + enum bpf_access_type t) +{ + int min_valid_off; + + if (t == BPF_WRITE) + min_valid_off = -MAX_BPF_STACK; + else + min_valid_off = -state->allocated_stack; + + if (off < min_valid_off || off > -1) + return -EACCES; + return 0; +} + +/* Check that the stack access at 'regno + off' falls within the maximum stack + * bounds. + * + * 'off' includes `regno->offset`, but not its dynamic part (if any). + */ +static int check_stack_access_within_bounds( + struct bpf_verifier_env *env, + int regno, int off, int access_size, + enum stack_access_src src, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state = func(env, reg); + int min_off, max_off; + int err; + char *err_extra; + + if (src == ACCESS_HELPER) + /* We don't know if helpers are reading or writing (or both). */ + err_extra = " indirect access to"; + else if (type == BPF_READ) + err_extra = " read from"; + else + err_extra = " write to"; + + if (tnum_is_const(reg->var_off)) { + min_off = reg->var_off.value + off; + if (access_size > 0) + max_off = min_off + access_size - 1; + else + max_off = min_off; + } else { + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smin_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack R%d\n", + err_extra, regno); + return -EACCES; + } + min_off = reg->smin_value + off; + if (access_size > 0) + max_off = reg->smax_value + off + access_size - 1; + else + max_off = min_off; + } + + err = check_stack_slot_within_bounds(min_off, state, type); + if (!err) + err = check_stack_slot_within_bounds(max_off, state, type); + + if (err) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid%s stack R%d off=%d size=%d\n", + err_extra, regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", + err_extra, regno, tn_buf, access_size); + } + } + return err; +} /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory @@ -3515,8 +3902,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - off += reg->var_off.value; - err = check_stack_access(env, reg, off, size); + /* Basic bounds checks. */ + err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err; @@ -3525,12 +3912,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - if (t == BPF_WRITE) - err = check_stack_write(env, state, off, size, - value_regno, insn_idx); - else - err = check_stack_read(env, state, off, size, + if (t == BPF_READ) + err = check_stack_read(env, regno, off, size, value_regno); + else + err = check_stack_write(env, regno, off, size, + value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -3606,13 +3993,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } -static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { + int load_reg; int err; - if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || - insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + break; + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); + return -EINVAL; + } + + if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { + verbose(env, "invalid atomic operand size\n"); return -EINVAL; } @@ -3626,6 +4030,13 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (err) return err; + if (insn->imm == BPF_CMPXCHG) { + /* Check comparison of R0 with memory location */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + } + if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; @@ -3635,66 +4046,91 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } - /* check whether atomic_add can read the memory */ + if (insn->imm & BPF_FETCH) { + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); + if (err) + return err; + } else { + /* This instruction accesses a memory location but doesn't + * actually load it into a register. + */ + load_reg = -1; + } + + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, load_reg, true); if (err) return err; - /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); -} - -static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, - int off, int access_size, - bool zero_size_allowed) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - } else { - char tn_buf[48]; + /* check whether we can write into the same memory */ + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1, true); + if (err) + return err; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", - regno, tn_buf, access_size); - } - return -EACCES; - } return 0; } -/* when register 'regno' is passed into function that will read 'access_size' - * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized. - * Unlike most pointer bounds-checking functions, this one doesn't take an - * 'off' argument, so it has to add in reg->off itself. +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type, that all elements of the stack are initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. */ -static int check_stack_boundary(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, - struct bpf_call_arg_meta *meta) +static int check_stack_range_initialized( + struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum stack_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; + char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; + enum bpf_access_type bounds_check_type; + /* Some accesses can write anything into the stack, others are + * read-only. + */ + bool clobber = false; + + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); + return -EACCES; + } + + if (type == ACCESS_HELPER) { + /* The bounds checks for writes are more permissive than for + * reads. However, if raw_mode is not set, we'll do extra + * checks below. + */ + bounds_check_type = BPF_WRITE; + clobber = true; + } else { + bounds_check_type = BPF_READ; + } + err = check_stack_access_within_bounds(env, regno, off, access_size, + type, bounds_check_type); + if (err) + return err; + if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) - return err; + min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@ -3705,8 +4141,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", + regno, err_extra, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -3718,28 +4154,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smax_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded indirect variable offset stack access\n", - regno); - return -EACCES; - } - min_off = reg->smin_value + reg->off; - max_off = reg->smax_value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d min value is outside of stack bound\n", - regno); - return err; - } - err = __check_stack_boundary(env, regno, max_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d max value is outside of stack bound\n", - regno); - return err; - } + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; } if (meta && meta->raw_mode) { @@ -3759,8 +4175,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (*stype == STACK_MISC) goto mark; if (*stype == STACK_ZERO) { - /* helper can write anything into the stack */ - *stype = STACK_MISC; + if (clobber) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + } goto mark; } @@ -3771,22 +4189,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (state->stack[spi].slot_type[0] == STACK_SPILL && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { - __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - state->stack[spi].slot_type[j] = STACK_MISC; + if (clobber) { + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + } goto mark; } err: if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - min_off, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", + err_extra, regno, min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", - tn_buf, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", + err_extra, regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -3835,8 +4255,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, "rdwr", &env->prog->aux->max_rdwr_access); case PTR_TO_STACK: - return check_stack_boundary(env, regno, access_size, - zero_size_allowed, meta); + return check_stack_range_initialized( + env, + regno, reg->off, access_size, + zero_size_allowed, ACCESS_HELPER, meta); default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@ -3850,6 +4272,29 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) +{ + if (register_is_null(reg)) + return 0; + + if (reg_type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + const struct bpf_reg_state saved_reg = *reg; + int rv; + + mark_ptr_not_null_reg(reg); + rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + *reg = saved_reg; + return rv; + } + + return check_helper_mem_access(env, regno, mem_size, true, NULL); +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -4321,7 +4766,7 @@ skip_type_check: err = mark_chain_precision(env, regno); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } @@ -4834,8 +5279,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, subprog); clear_caller_saved_regs(env, caller->regs); - /* All global functions return SCALAR_VALUE */ + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* continue with next insn after call */ return 0; @@ -5500,6 +5946,41 @@ do_sim: return !ret ? -EFAULT : 0; } +/* check that stack access falls within stack limits and that 'reg' doesn't + * have a variable offset. + * + * Variable offset is prohibited for unprivileged mode for simplicity since it + * requires corresponding support in Spectre masking for stack ALU. See also + * retrieve_ptr_limit(). + * + * + * 'off' includes 'reg->off'. + */ +static int check_stack_access_for_ptr_arithmetic( + struct bpf_verifier_env *env, + int regno, + const struct bpf_reg_state *reg, + int off) +{ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", + regno, tn_buf, off); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root; off=%d\n", regno, off); + return -EACCES; + } + + return 0; +} + + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -5743,10 +6224,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, "prohibited for !root\n", dst); return -EACCES; } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access(env, dst_reg, dst_reg->off + - dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " - "prohibited for !root\n", dst); + check_stack_access_for_ptr_arithmetic( + env, dst, dst_reg, dst_reg->off + + dst_reg->var_off.value)) { return -EACCES; } } @@ -6225,7 +6705,7 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@ -6256,7 +6736,7 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@ -6877,7 +7357,7 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JSGT: if (reg->s32_min_value > sval) return 1; - else if (reg->s32_max_value < sval) + else if (reg->s32_max_value <= sval) return 0; break; case BPF_JLT: @@ -6950,7 +7430,7 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JSGT: if (reg->smin_value > sval) return 1; - else if (reg->smax_value < sval) + else if (reg->smax_value <= sval) return 0; break; case BPF_JLT: @@ -7326,43 +7806,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reg_references(). * @@ -7945,6 +8401,9 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); + if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || + env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) + range = tnum_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { @@ -8590,7 +9049,11 @@ static bool range_within(struct bpf_reg_state *old, return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value; + old->smax_value >= cur->smax_value && + old->u32_min_value <= cur->u32_min_value && + old->u32_max_value >= cur->u32_max_value && + old->s32_min_value <= cur->s32_min_value && + old->s32_max_value >= cur->s32_max_value; } /* Maximum number of register states that can exist at once */ @@ -9526,14 +9989,19 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, env->insn_idx, insn); + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; continue; } + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -9705,6 +10173,36 @@ process_bpf_exit: return 0; } +static int find_btf_percpu_datasec(struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + /* + * Both vmlinux and module each have their own ".data..percpu" + * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF + * types to look at only module's own BTF types. + */ + n = btf_nr_types(btf); + if (btf_is_module(btf)) + i = btf_nr_types(btf_vmlinux); + else + i = 1; + + for(; i < n; i++) { + t = btf_type_by_id(btf, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, ".data..percpu")) + return i; + } + + return -ENOENT; +} + /* replace pseudo btf_id with kernel symbol address */ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -9712,48 +10210,57 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; + struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; + struct btf *btf; s32 datasec_id; u64 addr; - int i; + int i, btf_fd, err; - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } - - if (insn[1].imm != 0) { - verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); - return -EINVAL; + btf_fd = insn[1].imm; + if (btf_fd) { + btf = btf_get_by_fd(btf_fd); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + btf = btf_vmlinux; + btf_get(btf); } - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - return -ENOENT; + err = -ENOENT; + goto err_put; } if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", - id); - return -EINVAL; + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + err = -EINVAL; + goto err_put; } - sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - return -ENOENT; + err = -ENOENT; + goto err_put; } - datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", - BTF_KIND_DATASEC); + datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { - datasec = btf_type_by_id(btf_vmlinux, datasec_id); + datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; @@ -9766,10 +10273,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, insn[1].imm = addr >> 32; type = t->type; - t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9777,21 +10284,54 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, u32 tsize; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - return -EINVAL; + err = -EINVAL; + goto err_put; } aux->btf_var.reg_type = PTR_TO_MEM; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) { + if (env->used_btfs[i].btf == btf) { + btf_put(btf); + return 0; + } + } + + if (env->used_btf_cnt >= MAX_USED_BTFS) { + err = -E2BIG; + goto err_put; + } + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + err = -ENXIO; + goto err_put; + } + } + + env->used_btf_cnt++; + return 0; +err_put: + btf_put(btf); + return err; } static int check_map_prealloc(struct bpf_map *map) @@ -9893,15 +10433,22 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (!is_preallocated_map(map)) { verbose(env, - "Sleepable programs can only use preallocated hash maps\n"); + "Sleepable programs can only use preallocated maps\n"); return -EINVAL; } break; + case BPF_MAP_TYPE_RINGBUF: + break; default: verbose(env, - "Sleepable programs can only use array and hash maps\n"); + "Sleepable programs can only use array, hash, and ringbuf maps\n"); return -EINVAL; } @@ -9938,13 +10485,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -EINVAL; } - if (BPF_CLASS(insn->code) == BPF_STX && - ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; @@ -10088,6 +10628,13 @@ static void release_maps(struct bpf_verifier_env *env) env->used_map_cnt); } +/* drop refcnt of maps used by the rejected program */ +static void release_btfs(struct bpf_verifier_env *env) +{ + __bpf_free_used_btfs(env->prog->aux, env->used_btfs, + env->used_btf_cnt); +} + /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { @@ -10459,6 +11006,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; + u8 load_reg; insn = insns[adj_idx]; if (!aux[adj_idx].zext_dst) { @@ -10501,9 +11049,27 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext()) continue; + /* zext_dst means that we want to zero-extend whatever register + * the insn defines, which is dst_reg most of the time, with + * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. + */ + if (BPF_CLASS(insn.code) == BPF_STX && + BPF_MODE(insn.code) == BPF_ATOMIC) { + /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not + * define any registers, therefore zext_dst cannot be + * set. + */ + if (WARN_ON(!(insn.imm & BPF_FETCH))) + return -EINVAL; + load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 + : insn.src_reg; + } else { + load_reg = insn.dst_reg; + } + zext_patch[0] = insn; - zext_patch[1].dst_reg = insn.dst_reg; - zext_patch[1].src_reg = insn.dst_reg; + zext_patch[1].dst_reg = load_reg; + zext_patch[1].src_reg = load_reg; patch = zext_patch; patch_len = 2; apply_patch_buffer: @@ -10719,8 +11285,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is @@ -10761,7 +11326,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* BPF_PROG_RUN doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->aux->stats will never be accessed and stays NULL + * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) @@ -10849,8 +11414,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - @@ -10895,8 +11459,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); @@ -10925,8 +11488,7 @@ out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; @@ -10961,8 +11523,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); if (depth < 0) @@ -10999,30 +11560,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - struct bpf_insn mask_and_div[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), - /* Rx div 0 -> 0 */ - BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + struct bpf_insn *patchlet; + struct bpf_insn chk_and_div[] = { + /* [R,W]x div 0 -> 0 */ + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0), BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), BPF_JMP_IMM(BPF_JA, 0, 0, 1), *insn, }; - struct bpf_insn mask_and_mod[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), - /* Rx mod 0 -> Rx */ - BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + struct bpf_insn chk_and_mod[] = { + /* [R,W]x mod 0 -> [R,W]x */ + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1 + (is64 ? 0 : 1), 0), *insn, + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), }; - struct bpf_insn *patchlet; - if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - patchlet = mask_and_div + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); - } else { - patchlet = mask_and_mod + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); - } + patchlet = isdiv ? chk_and_div : chk_and_mod; + cnt = isdiv ? ARRAY_SIZE(chk_and_div) : + ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) @@ -11427,6 +11988,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); + else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); + regs[i].mem_size = mem_size; + regs[i].id = ++env->id_gen; + } } } else { /* 1st arg to a function */ @@ -12005,6 +12573,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); @@ -12100,7 +12669,10 @@ skip_full_check: goto err_release_maps; } - if (ret == 0 && env->used_map_cnt) { + if (ret) + goto err_release_maps; + + if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), @@ -12114,15 +12686,29 @@ skip_full_check: memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; + } + if (env->used_btf_cnt) { + /* if program passed verifier, update used_btfs in bpf_prog_aux */ + env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, + sizeof(env->used_btfs[0]), + GFP_KERNEL); + if (!env->prog->aux->used_btfs) { + ret = -ENOMEM; + goto err_release_maps; + } + memcpy(env->prog->aux->used_btfs, env->used_btfs, + sizeof(env->used_btfs[0]) * env->used_btf_cnt); + env->prog->aux->used_btf_cnt = env->used_btf_cnt; + } + if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } - if (ret == 0) - adjust_btf_func(env); + adjust_btf_func(env); err_release_maps: if (!env->prog->aux->used_maps) @@ -12130,6 +12716,8 @@ err_release_maps: * them now. Otherwise free_used_maps() will release them. */ release_maps(env); + if (!env->prog->aux->used_btfs) + release_btfs(env); /* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 32596fdbcd5b..a5751784ad74 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -917,6 +917,9 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; + if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) + return invalfc(fc, "Disabled controller '%s'", + param->key); ctx->subsys_mask |= (1 << i); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 613845769103..1ea995f801ec 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3564,6 +3564,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, { struct psi_trigger *new; struct cgroup *cgrp; + struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) @@ -3572,7 +3573,8 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); - new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index b1496e744c68..da95df381483 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -36,7 +36,7 @@ struct map_benchmark { __s32 node; /* which numa node this benchmark will run on */ __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ - __u64 expansion[10]; /* For future use */ + __u8 expansion[84]; /* For future use */ }; struct map_benchmark_data { @@ -147,8 +147,10 @@ static int do_map_benchmark(struct map_benchmark_data *map) atomic64_set(&map->sum_sq_unmap, 0); atomic64_set(&map->loops, 0); - for (i = 0; i < threads; i++) + for (i = 0; i < threads; i++) { + get_task_struct(tsk[i]); wake_up_process(tsk[i]); + } msleep_interruptible(map->bparam.seconds * 1000); @@ -183,6 +185,8 @@ static int do_map_benchmark(struct map_benchmark_data *map) } out: + for (i = 0; i < threads; i++) + put_task_struct(tsk[i]); put_device(map->dev); kfree(tsk); return ret; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 378341642f94..f9d491b17b78 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -209,26 +209,18 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) lockdep_sys_exit(); } -#ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long work) -{ - return false; -} -#else /* * If SYSCALL_EMU is set, then the only reason to report is when - * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ static inline bool report_single_step(unsigned long work) { - if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + if (work & SYSCALL_WORK_SYSCALL_EMU) return false; - return !!(current_thread_info()->flags & _TIF_SINGLESTEP); + return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; } -#endif - static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index b0338a5625d9..c240302f56e2 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -50,10 +50,10 @@ bool syscall_user_dispatch(struct pt_regs *regs) if (unlikely(__get_user(state, sd->selector))) do_exit(SIGSEGV); - if (likely(state == PR_SYS_DISPATCH_OFF)) + if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW)) return false; - if (state != PR_SYS_DISPATCH_ON) + if (state != SYSCALL_DISPATCH_FILTER_BLOCK) do_exit(SIGSYS); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..c37401e3e5f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -53,6 +53,7 @@ #include <linux/min_heap.h> #include <linux/highmem.h> #include <linux/pgtable.h> +#include <linux/buildid.h> #include "internal.h" @@ -397,6 +398,7 @@ static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; +static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4673,6 +4675,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); + if (event->attr.build_id) + atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) @@ -8046,6 +8050,8 @@ struct perf_mmap_event { u64 ino; u64 ino_generation; u32 prot, flags; + u8 build_id[BUILD_ID_SIZE_MAX]; + u32 build_id_size; struct { struct perf_event_header header; @@ -8077,6 +8083,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; + bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) @@ -8101,13 +8108,25 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); + use_build_id = event->attr.build_id && mmap_event->build_id_size; + + if (event->attr.mmap2 && use_build_id) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; + perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { - perf_output_put(&handle, mmap_event->maj); - perf_output_put(&handle, mmap_event->min); - perf_output_put(&handle, mmap_event->ino); - perf_output_put(&handle, mmap_event->ino_generation); + if (use_build_id) { + u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; + + __output_copy(&handle, size, 4); + __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); + } else { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } @@ -8236,6 +8255,9 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + if (atomic_read(&nr_build_id_events)) + build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -11172,6 +11194,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); + if (event->attr.build_id) + atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) diff --git a/kernel/futex.c b/kernel/futex.c index c47d1015d759..45a13eb8894e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -763,6 +763,29 @@ static struct futex_pi_state *alloc_pi_state(void) return pi_state; } +static void pi_state_update_owner(struct futex_pi_state *pi_state, + struct task_struct *new_owner) +{ + struct task_struct *old_owner = pi_state->owner; + + lockdep_assert_held(&pi_state->pi_mutex.wait_lock); + + if (old_owner) { + raw_spin_lock(&old_owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&old_owner->pi_lock); + } + + if (new_owner) { + raw_spin_lock(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + raw_spin_unlock(&new_owner->pi_lock); + } +} + static void get_pi_state(struct futex_pi_state *pi_state) { WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); @@ -785,17 +808,11 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - struct task_struct *owner; unsigned long flags; raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); - owner = pi_state->owner; - if (owner) { - raw_spin_lock(&owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock(&owner->pi_lock); - } - rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + pi_state_update_owner(pi_state, NULL); + rt_mutex_proxy_unlock(&pi_state->pi_mutex); raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } @@ -941,7 +958,8 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * FUTEX_OWNER_DIED bit. See [4] * * [10] There is no transient state which leaves owner and user space - * TID out of sync. + * TID out of sync. Except one error case where the kernel is denied + * write access to the user address, see fixup_pi_state_owner(). * * * Serialization and lifetime rules: @@ -1521,26 +1539,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ ret = -EINVAL; } - if (ret) - goto out_unlock; - - /* - * This is a point of no return; once we modify the uval there is no - * going back and subsequent operations must not fail. - */ - - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - - raw_spin_lock(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock(&new_owner->pi_lock); - - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + if (!ret) { + /* + * This is a point of no return; once we modified the uval + * there is no going back and subsequent operations must + * not fail. + */ + pi_state_update_owner(pi_state, new_owner); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); @@ -2323,18 +2330,13 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *argowner) +static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, curval, newval; struct task_struct *oldowner, *newowner; - u32 newtid; - int ret, err = 0; - - lockdep_assert_held(q->lock_ptr); - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + u32 uval, curval, newval, newtid; + int err = 0; oldowner = pi_state->owner; @@ -2368,14 +2370,12 @@ retry: * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; - goto out_unlock; + return 0; } if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { - /* We got the lock after all, nothing to fix. */ - ret = 0; - goto out_unlock; + /* We got the lock. pi_state is correct. Tell caller. */ + return 1; } /* @@ -2402,8 +2402,7 @@ retry: * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; - goto out_unlock; + return 1; } newowner = argowner; } @@ -2433,22 +2432,9 @@ retry: * We fixed up user space. Now we need to fix the pi_state * itself. */ - if (pi_state->owner != NULL) { - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - } + pi_state_update_owner(pi_state, newowner); - pi_state->owner = newowner; - - raw_spin_lock(&newowner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock(&newowner->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - - return 0; + return argowner == current; /* * In order to reschedule or handle a page fault, we need to drop the @@ -2469,17 +2455,16 @@ handle_err: switch (err) { case -EFAULT: - ret = fault_in_user_writeable(uaddr); + err = fault_in_user_writeable(uaddr); break; case -EAGAIN: cond_resched(); - ret = 0; + err = 0; break; default: WARN_ON_ONCE(1); - ret = err; break; } @@ -2489,17 +2474,44 @@ handle_err: /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) { - ret = 0; - goto out_unlock; - } + if (pi_state->owner != oldowner) + return argowner == current; - if (ret) - goto out_unlock; + /* Retry if err was -EAGAIN or the fault in succeeded */ + if (!err) + goto retry; - goto retry; + /* + * fault_in_user_writeable() failed so user state is immutable. At + * best we can make the kernel state consistent but user state will + * be most likely hosed and any subsequent unlock operation will be + * rejected due to PI futex rule [10]. + * + * Ensure that the rtmutex owner is also the pi_state owner despite + * the user space value claiming something different. There is no + * point in unlocking the rtmutex if current is the owner as it + * would need to wait until the next waiter has taken the rtmutex + * to guarantee consistent state. Keep it simple. Userspace asked + * for this wreckaged state. + * + * The rtmutex has an owner - either current or some other + * task. See the EAGAIN loop above. + */ + pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); -out_unlock: + return err; +} + +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) +{ + struct futex_pi_state *pi_state = q->pi_state; + int ret; + + lockdep_assert_held(q->lock_ptr); + + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + ret = __fixup_pi_state_owner(uaddr, q, argowner); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } @@ -2523,8 +2535,6 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - int ret = 0; - if (locked) { /* * Got the lock. We might not be the anticipated owner if we @@ -2535,8 +2545,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current); - return ret ? ret : locked; + return fixup_pi_state_owner(uaddr, q, current); + return 1; } /* @@ -2547,23 +2557,17 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Another speculative read; pi_state->owner == current is unstable * but needs our attention. */ - if (q->pi_state->owner == current) { - ret = fixup_pi_state_owner(uaddr, q, NULL); - return ret; - } + if (q->pi_state->owner == current) + return fixup_pi_state_owner(uaddr, q, NULL); /* * Paranoia check. If we did not take the lock, then we should not be - * the owner of the rt_mutex. + * the owner of the rt_mutex. Warn and establish consistent state. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { - printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " - "pi-state %p\n", ret, - q->pi_state->pi_mutex.owner, - q->pi_state->owner); - } + if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) + return fixup_pi_state_owner(uaddr, q, current); - return ret; + return 0; } /** @@ -2771,7 +2775,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -2907,23 +2910,8 @@ no_block: if (res) ret = (res < 0) ? res : 0; - /* - * If fixup_owner() faulted and was unable to handle the fault, unlock - * it and return the fault to userspace. - */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock */ unqueue_me_pi(&q); - - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - goto out; out_unlock_put_key: @@ -3183,7 +3171,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; @@ -3261,16 +3248,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. + */ + ret = ret < 0 ? ret : 0; } } else { struct rt_mutex *pi_mutex; @@ -3301,25 +3289,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; - /* - * If fixup_pi_state_owner() faulted and was unable to handle - * the fault, unlock the rt_mutex and return the fault to - * userspace. - */ - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3110c77230c7..f62de2dea8a3 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS if !UML + select CONSTRUCTORS default n help This option enables gcov-based code profiling (e.g. for code coverage diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index dc0e2d7fbdfd..b338d622f26e 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -436,22 +436,22 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, can_reserve = msi_check_reservation_mode(domain, info, dev); - for_each_msi_entry(desc, dev) { - virq = desc->irq; - if (desc->nvec_used == 1) - dev_dbg(dev, "irq %d for MSI\n", virq); - else + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) + goto skip_activate; + + for_each_msi_vector(desc, i, dev) { + if (desc->irq == i) { + virq = desc->irq; dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); - /* - * This flag is set by the PCI layer as we need to activate - * the MSI entries before the PCI layer enables MSI in the - * card. Otherwise the card latches a random msi message. - */ - if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) - continue; + } - irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_data = irq_domain_get_irq_data(domain, i); if (!can_reserve) { irqd_clr_can_reserve(irq_data); if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) @@ -462,28 +462,24 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, goto cleanup; } +skip_activate: /* * If these interrupts use reservation mode, clear the activated bit * so request_irq() will assign the final vector. */ if (can_reserve) { - for_each_msi_entry(desc, dev) { - irq_data = irq_domain_get_irq_data(domain, desc->irq); + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); irqd_clr_activated(irq_data); } } return 0; cleanup: - for_each_msi_entry(desc, dev) { - struct irq_data *irqd; - - if (desc->irq == virq) - break; - - irqd = irq_domain_get_irq_data(domain, desc->irq); - if (irqd_is_activated(irqd)) - irq_domain_deactivate_irq(irqd); + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); } msi_domain_free_irqs(domain, dev); return ret; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 4f8efc278aa7..a0b6780740c8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1076,7 +1076,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; + prstatus.common.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); @@ -1134,7 +1134,6 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - lock_system_sleep(); pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1197,7 +1196,6 @@ int kernel_kexec(void) thaw_processes(); Restore_console: pm_restore_console(); - unlock_system_sleep(); } #endif diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f7fb5d135930..d5a3eb74a657 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1954,28 +1954,48 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset) return !offset; } -bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) +/** + * kprobe_on_func_entry() -- check whether given address is function entry + * @addr: Target address + * @sym: Target symbol name + * @offset: The offset from the symbol or the address + * + * This checks whether the given @addr+@offset or @sym+@offset is on the + * function entry address or not. + * This returns 0 if it is the function entry, or -EINVAL if it is not. + * And also it returns -ENOENT if it fails the symbol or address lookup. + * Caller must pass @addr or @sym (either one must be NULL), or this + * returns -EINVAL. + */ +int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); if (IS_ERR(kp_addr)) - return false; + return PTR_ERR(kp_addr); - if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || - !arch_kprobe_on_func_entry(offset)) - return false; + if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset)) + return -ENOENT; - return true; + if (!arch_kprobe_on_func_entry(offset)) + return -EINVAL; + + return 0; } int register_kretprobe(struct kretprobe *rp) { - int ret = 0; + int ret; struct kretprobe_instance *inst; int i; void *addr; - if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) + ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset); + if (ret) + return ret; + + /* If only rp->kp.addr is specified, check reregistering kprobes */ + if (rp->kp.addr && check_kprobe_rereg(&rp->kp)) return -EINVAL; if (kretprobe_blacklist_size) { diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index cfdd5b93264d..2f8cd616d3b2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1716,8 +1716,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, * possible because it belongs to the pi_state which is about to be freed * and it is not longer visible to other tasks. */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) +void rt_mutex_proxy_unlock(struct rt_mutex *lock) { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index d1d62f942be2..ca6fb489007b 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -133,8 +133,7 @@ enum rtmutex_chainwalk { extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a7320f07689d..6bfe3ead10ad 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -139,7 +139,6 @@ config PM_SLEEP_SMP_NONZERO_CPU config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP - default n help Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. @@ -147,7 +146,6 @@ config PM_AUTOSLEEP config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP - default n help Allow user space to create, activate and deactivate wakeup source objects with the help of a sysfs-based interface. @@ -293,7 +291,6 @@ config PM_GENERIC_DOMAINS config WQ_POWER_EFFICIENT_DEFAULT bool "Enable workqueue power-efficient mode by default" depends on PM - default n help Per-cpu workqueues are generally preferred because they show better performance thanks to cache locality; unfortunately, @@ -322,15 +319,14 @@ config CPU_PM bool config ENERGY_MODEL - bool "Energy Model for CPUs" + bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" depends on SMP depends on CPU_FREQ - default n help Several subsystems (thermal and/or the task scheduler for example) - can leverage information about the energy consumed by CPUs to make - smarter decisions. This config option enables the framework from - which subsystems can access the energy models. + can leverage information about the energy consumed by devices to + make smarter decisions. This config option enables the framework + from which subsystems can access the energy models. The exact usage of the energy model is subsystem-dependent. diff --git a/kernel/power/main.c b/kernel/power/main.c index 0aefd6f57e0a..12c7e1bb442f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -387,7 +387,7 @@ static struct attribute *suspend_attrs[] = { NULL, }; -static struct attribute_group suspend_attr_group = { +static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, }; diff --git a/kernel/power/process.c b/kernel/power/process.c index 45b054b7b5ec..50cc63534486 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -235,7 +235,7 @@ void thaw_kernel_threads(void) read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + if (p->flags & PF_KTHREAD) __thaw_task(p); } read_unlock(&tasklist_lock); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index c73f2e295167..72e33054a2e1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -497,10 +497,10 @@ static int swap_writer_finish(struct swap_map_handle *handle, unsigned int flags, int error) { if (!error) { - flush_swap_writer(handle); pr_info("S"); error = mark_swapfiles(handle, flags); pr_cont("|\n"); + flush_swap_writer(handle); } if (error) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6639a0cfe0ac..5a95c688621f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1398,7 +1398,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog, * not counted in the return value. */ if (buf_size > 0) - text[len] = 0; + r->text_buf[len] = 0; return len; } diff --git a/kernel/sys.c b/kernel/sys.c index 51f00fe20e4d..6928d23c46ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -24,7 +24,6 @@ #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> -#include <linux/dcookies.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87389b9e21ab..5247afd7f345 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -502,7 +502,7 @@ static struct hrtimer sync_hrtimer; static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { - queue_work(system_power_efficient_wq, &sync_work); + queue_work(system_freezable_power_efficient_wq, &sync_work); return HRTIMER_NORESTART; } @@ -668,7 +668,7 @@ void ntp_notify_cmos_timer(void) * just a pointless work scheduled. */ if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) - queue_work(system_power_efficient_wq, &sync_work); + queue_work(system_freezable_power_efficient_wq, &sync_work); } static void __init ntp_init_cmos_sync(void) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb0fe4c66b84..9e9ee4945043 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -915,22 +915,24 @@ static void blk_add_trace_bio_complete(void *ignore, static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, + 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, + 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) @@ -967,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); @@ -997,7 +999,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c0018abe68a..b0c45d923f0f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,9 +96,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; - if (in_nmi()) /* not supported yet */ - return 1; - cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { @@ -1191,6 +1188,10 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_ITER) + return true; + if (prog->type == BPF_PROG_TYPE_LSM) return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); @@ -1760,6 +1761,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 73edb9e4f354..29a6ebeebc9e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -394,7 +394,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->curr_ret_stack = -1; t->curr_ret_depth = -1; @@ -489,7 +488,6 @@ static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); static void graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) { - atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; /* make curr_ret_stack visible before we add the ret_stack */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8a2d786b503..b5815a022ecc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2745,7 +2745,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); - if (val == 1) { + if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) { trace_event_setup(entry, type, flags, pc); entry->array[0] = len; return entry; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e9d28eeccb7e..d387b774ceeb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1212,7 +1212,8 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!trace_event_name(call) || !call->class || !call->class->reg) + if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || + !trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d06aab4dcbb8..6756379b661f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -562,6 +562,8 @@ static int __irqsoff_tracer_init(struct trace_array *tr) /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + /* without pause, we will produce garbage if another latency occurs */ + set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1); tr->max_latency = 0; irqsoff_trace = tr; @@ -583,11 +585,13 @@ static void __irqsoff_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE; stop_irqsoff_tracer(tr, is_graph(tr)); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag); ftrace_reset_array_ops(tr); irqsoff_busy = false; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e6fba1798771..56c7fbff7bd7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -221,9 +221,9 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return tk ? kprobe_on_func_entry(tk->rp.kp.addr, + return tk ? (kprobe_on_func_entry(tk->rp.kp.addr, tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, - tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false; + tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0) : false; } bool trace_kprobe_error_injectable(struct trace_event_call *call) @@ -828,9 +828,11 @@ static int trace_kprobe_create(int argc, const char *argv[]) } if (is_return) flags |= TPARG_FL_RETURN; - if (kprobe_on_func_entry(NULL, symbol, offset)) + ret = kprobe_on_func_entry(NULL, symbol, offset); + if (ret == 0) flags |= TPARG_FL_FENTRY; - if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { + /* Defer the ENOENT case until register kprobe */ + if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); goto parse_error; } |