diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/btf.c | 11 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 46 | ||||
-rw-r--r-- | kernel/bpf/log.c | 480 | ||||
-rw-r--r-- | kernel/bpf/lpm_trie.c | 3 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 11 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 3 | ||||
-rw-r--r-- | kernel/bpf/task_iter.c | 29 | ||||
-rw-r--r-- | kernel/bpf/tnum.c | 7 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 1676 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-internal.h | 4 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 34 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 45 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 12 |
13 files changed, 1304 insertions, 1057 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 15d71d2986d3..63cf4128fc05 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3840,9 +3840,6 @@ end: return ERR_PTR(ret); } -#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT) -#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE) - int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; @@ -3855,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. */ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT)) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; u32 btf_id; - if (!(rec->fields[i].type & GRAPH_ROOT_MASK)) + if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); @@ -3873,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * to check ownership cycle for a type unless it's also a * node type. */ - if (!(rec->field_mask & GRAPH_NODE_MASK)) + if (!(rec->field_mask & BPF_GRAPH_NODE)) continue; /* We need to ensure ownership acyclicity among all types. The @@ -3909,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * - A is both an root and node. * - B is only an node. */ - if (meta->record->field_mask & GRAPH_ROOT_MASK) + if (meta->record->field_mask & BPF_GRAPH_ROOT) return -ELOOP; } return 0; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 56b0c1f678ee..b45a8381f9bd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1937,10 +1937,7 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) ma = &bpf_global_percpu_ma; else ma = &bpf_global_ma; - if (rec && rec->refcount_off >= 0) - bpf_mem_free_rcu(ma, p); - else - bpf_mem_free(ma, p); + bpf_mem_free_rcu(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -2231,6 +2228,25 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, rcu_read_unlock(); return ret; } + +/** + * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a + * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its + * hierarchy ID. + * @task: The target task + * @hierarchy_id: The ID of a cgroup1 hierarchy + * + * On success, the cgroup is returen. On failure, NULL is returned. + */ +__bpf_kfunc struct cgroup * +bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) +{ + struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); + + if (IS_ERR(cgrp)) + return NULL; + return cgrp; +} #endif /* CONFIG_CGROUPS */ /** @@ -2520,7 +2536,7 @@ BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) @@ -2537,6 +2553,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) +BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) @@ -2618,3 +2635,22 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); + +/* Get a pointer to dynptr data up to len bytes for read only access. If + * the dynptr doesn't have continuous data up to len bytes, return NULL. + */ +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +{ + return bpf_dynptr_slice(ptr, 0, NULL, len); +} + +/* Get a pointer to dynptr data up to len bytes for read write access. If + * the dynptr doesn't have continuous data up to len bytes, or the dynptr + * is read only, return NULL. + */ +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +{ + if (__bpf_dynptr_is_rdonly(ptr)) + return NULL; + return (void *)__bpf_dynptr_data(ptr, len); +} diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 850494423530..3505f3e5ae96 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -10,6 +10,8 @@ #include <linux/bpf_verifier.h> #include <linux/math64.h> +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) + static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { /* ubuf and len_total should both be specified (or not) together */ @@ -325,3 +327,481 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, va_end(args); } EXPORT_SYMBOL_GPL(bpf_log); + +static const struct bpf_line_info * +find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +{ + const struct bpf_line_info *linfo; + const struct bpf_prog *prog; + u32 i, nr_linfo; + + prog = env->prog; + nr_linfo = prog->aux->nr_linfo; + + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + for (i = 1; i < nr_linfo; i++) + if (insn_off < linfo[i].insn_off) + break; + + return &linfo[i - 1]; +} + +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...) +{ + const struct bpf_line_info *linfo; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + linfo = find_linfo(env, insn_off); + if (!linfo || linfo == env->prev_linfo) + return; + + if (prefix_fmt) { + va_list args; + + va_start(args, prefix_fmt); + bpf_verifier_vlog(&env->log, prefix_fmt, args); + va_end(args); + } + + verbose(env, "%s\n", + ltrim(btf_name_by_offset(env->prog->aux->btf, + linfo->line_off))); + + env->prev_linfo = linfo; +} + +static const char *btf_type_name(const struct btf *btf, u32 id) +{ + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); +} + +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) +{ + char postfix[16] = {0}, prefix[64] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "scalar", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_BUF] = "buf", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", + type & MEM_RDONLY ? "rdonly_" : "", + type & MEM_RINGBUF ? "ringbuf_" : "", + type & MEM_USER ? "user_" : "", + type & MEM_PERCPU ? "percpu_" : "", + type & MEM_RCU ? "rcu_" : "", + type & PTR_UNTRUSTED ? "untrusted_" : "", + type & PTR_TRUSTED ? "trusted_" : "" + ); + + snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); + return env->tmp_str_buf; +} + +const char *dynptr_type_str(enum bpf_dynptr_type type) +{ + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + return "local"; + case BPF_DYNPTR_TYPE_RINGBUF: + return "ringbuf"; + case BPF_DYNPTR_TYPE_SKB: + return "skb"; + case BPF_DYNPTR_TYPE_XDP: + return "xdp"; + case BPF_DYNPTR_TYPE_INVALID: + return "<invalid>"; + default: + WARN_ONCE(1, "unknown dynptr type %d\n", type); + return "<unknown>"; + } +} + +const char *iter_type_str(const struct btf *btf, u32 btf_id) +{ + if (!btf || btf_id == 0) + return "<invalid>"; + + /* we already validated that type is valid and has conforming name */ + return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; +} + +const char *iter_state_str(enum bpf_iter_state state) +{ + switch (state) { + case BPF_ITER_STATE_ACTIVE: + return "active"; + case BPF_ITER_STATE_DRAINED: + return "drained"; + case BPF_ITER_STATE_INVALID: + return "<invalid>"; + default: + WARN_ONCE(1, "unknown iter state %d\n", state); + return "<unknown>"; + } +} + +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', + [STACK_ITER] = 'i', +}; + +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); + if (live & REG_LIVE_DONE) + verbose(env, "D"); +} + +#define UNUM_MAX_DECIMAL U16_MAX +#define SNUM_MAX_DECIMAL S16_MAX +#define SNUM_MIN_DECIMAL S16_MIN + +static bool is_unum_decimal(u64 num) +{ + return num <= UNUM_MAX_DECIMAL; +} + +static bool is_snum_decimal(s64 num) +{ + return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL; +} + +static void verbose_unum(struct bpf_verifier_env *env, u64 num) +{ + if (is_unum_decimal(num)) + verbose(env, "%llu", num); + else + verbose(env, "%#llx", num); +} + +static void verbose_snum(struct bpf_verifier_env *env, s64 num) +{ + if (is_snum_decimal(num)) + verbose(env, "%lld", num); + else + verbose(env, "%#llx", num); +} + +static void print_scalar_ranges(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + const char **sep) +{ + /* For signed ranges, we want to unify 64-bit and 32-bit values in the + * output as much as possible, but there is a bit of a complication. + * If we choose to print values as decimals, this is natural to do, + * because negative 64-bit and 32-bit values >= -S32_MIN have the same + * representation due to sign extension. But if we choose to print + * them in hex format (see is_snum_decimal()), then sign extension is + * misleading. + * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in + * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two + * very different numbers. + * So we avoid sign extension if we choose to print values in hex. + */ + struct { + const char *name; + u64 val; + bool omit; + } minmaxs[] = { + {"smin", reg->smin_value, reg->smin_value == S64_MIN}, + {"smax", reg->smax_value, reg->smax_value == S64_MAX}, + {"umin", reg->umin_value, reg->umin_value == 0}, + {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin32", + is_snum_decimal((s64)reg->s32_min_value) + ? (s64)reg->s32_min_value + : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + {"smax32", + is_snum_decimal((s64)reg->s32_max_value) + ? (s64)reg->s32_max_value + : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, + {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, + {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; + bool neg1, neg2; + + for (m1 = &minmaxs[0]; m1 < mend; m1++) { + if (m1->omit) + continue; + + neg1 = m1->name[0] == 's' && (s64)m1->val < 0; + + verbose(env, "%s%s=", *sep, m1->name); + *sep = ","; + + for (m2 = m1 + 2; m2 < mend; m2 += 2) { + if (m2->omit || m2->val != m1->val) + continue; + /* don't mix negatives with positives */ + neg2 = m2->name[0] == 's' && (s64)m2->val < 0; + if (neg2 != neg1) + continue; + m2->omit = true; + verbose(env, "%s=", m2->name); + } + + if (m1->name[0] == 's') + verbose_snum(env, m1->val); + else + verbose_unum(env, m1->val); + } +} + +static bool type_is_map_ptr(enum bpf_reg_type t) { + switch (base_type(t)) { + case CONST_PTR_TO_MAP: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + return true; + default: + return false; + } +} + +static void print_reg_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, + const struct bpf_reg_state *reg) +{ + enum bpf_reg_type t; + const char *sep = ""; + + t = reg->type; + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { + /* reg->off should be 0 for SCALAR_VALUE */ + verbose_snum(env, reg->var_off.value + reg->off); + return; + } +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) + + verbose(env, "%s", reg_type_str(env, t)); + if (t == PTR_TO_STACK) { + if (state->frameno != reg->frameno) + verbose(env, "[%d]", reg->frameno); + if (tnum_is_const(reg->var_off)) { + verbose_snum(env, reg->var_off.value + reg->off); + return; + } + } + if (base_type(t) == PTR_TO_BTF_ID) + verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); + verbose(env, "("); + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg->ref_obj_id) + verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (type_is_non_owning_ref(reg->type)) + verbose_a("%s", "non_own_ref"); + if (type_is_map_ptr(t)) { + if (reg->map_ptr->name[0]) + verbose_a("map=%s", reg->map_ptr->name); + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); + } + if (t != SCALAR_VALUE && reg->off) { + verbose_a("off="); + verbose_snum(env, reg->off); + } + if (type_is_pkt_pointer(t)) { + verbose_a("r="); + verbose_unum(env, reg->range); + } + if (tnum_is_const(reg->var_off)) { + /* a pointer register with fixed offset */ + if (reg->var_off.value) { + verbose_a("imm="); + verbose_snum(env, reg->var_off.value); + } + } else { + print_scalar_ranges(env, reg, &sep); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose_a("var_off=%s", tn_buf); + } + } + verbose(env, ")"); + +#undef verbose_a +} + +void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, + bool print_all) +{ + const struct bpf_reg_state *reg; + int i; + + if (state->frameno) + verbose(env, " frame%d:", state->frameno); + for (i = 0; i < MAX_BPF_REG; i++) { + reg = &state->regs[i]; + if (reg->type == NOT_INIT) + continue; + if (!print_all && !reg_scratched(env, i)) + continue; + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "="); + print_reg_state(env, state, reg); + } + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + u8 slot_type; + int j; + + if (!print_all && !stack_slot_scratched(env, i)) + continue; + + for (j = 0; j < BPF_REG_SIZE; j++) { + slot_type = state->stack[i].slot_type[j]; + if (slot_type != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[slot_type]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + + reg = &state->stack[i].spilled_ptr; + switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: + /* print MISC/ZERO/INVALID slots above subreg spill */ + for (j = 0; j < BPF_REG_SIZE; j++) + if (state->stack[i].slot_type[j] == STACK_SPILL) + break; + types_buf[j] = '\0'; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=%s", types_buf); + print_reg_state(env, state, reg); + break; + case STACK_DYNPTR: + /* skip to main dynptr slot */ + i += BPF_DYNPTR_NR_SLOTS - 1; + reg = &state->stack[i].spilled_ptr; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); + if (reg->ref_obj_id) + verbose(env, "(ref_id=%d)", reg->ref_obj_id); + break; + case STACK_ITER: + /* only main slot has ref_obj_id set; skip others */ + if (!reg->ref_obj_id) + continue; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + iter_type_str(reg->iter.btf, reg->iter.btf_id), + reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->iter.depth); + break; + case STACK_MISC: + case STACK_ZERO: + default: + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=%s", types_buf); + break; + } + } + if (state->acquired_refs && state->refs[0].id) { + verbose(env, " refs=%d", state->refs[0].id); + for (i = 1; i < state->acquired_refs; i++) + if (state->refs[i].id) + verbose(env, ",%d", state->refs[i].id); + } + if (state->in_callback_fn) + verbose(env, " cb"); + if (state->in_async_callback_fn) + verbose(env, " async_cb"); + verbose(env, "\n"); + if (!print_all) + mark_verifier_state_clean(env); +} + +static inline u32 vlog_alignment(u32 pos) +{ + return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), + BPF_LOG_MIN_ALIGNMENT) - pos - 1; +} + +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) +{ + if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { + /* remove new line character */ + bpf_vlog_reset(&env->log, env->prev_log_pos - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); + } else { + verbose(env, "%d:", env->insn_idx); + } + print_verifier_state(env, state, false); +} diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 17c7e7782a1f..b32be680da6c 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -231,6 +231,9 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) struct lpm_trie_node *node, *found = NULL; struct bpf_lpm_trie_key *key = _key; + if (key->prefixlen > trie->max_prefixlen) + return NULL; + /* Start walking the trie from the root node ... */ for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d6b277482085..dff7ba539701 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -388,6 +388,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; + bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; @@ -410,6 +411,14 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, if (task && user && !user_mode(regs)) goto err_fault; + /* get_perf_callchain does not support crosstask user stack walking + * but returns an empty stack instead of NULL. + */ + if (crosstask && user) { + err = -EOPNOTSUPP; + goto clear; + } + num_elem = size / elem_size; max_depth = num_elem + skip; if (sysctl_perf_event_max_stack < max_depth) @@ -421,7 +430,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, - false, false); + crosstask, false); if (unlikely(!trace)) goto err_fault; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0ed286b8a0f0..5e43ddd1b83f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2573,7 +2573,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | - BPF_F_XDP_DEV_BOUND_ONLY)) + BPF_F_XDP_DEV_BOUND_ONLY | + BPF_F_TEST_REG_INVARIANTS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 26082b97894d..e5c3500443c6 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -70,15 +70,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm return NULL; retry: - task = next_thread(task); + task = __next_thread(task); + if (!task) + return NULL; next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns); - if (!next_tid || next_tid == common->pid) { - /* Run out of tasks of a process. The tasks of a - * thread_group are linked as circular linked list. - */ - return NULL; - } + if (!next_tid) + goto retry; if (skip_if_dup_files && task->files == task->group_leader->files) goto retry; @@ -980,7 +978,6 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != __alignof__(struct bpf_iter_task)); - kit->task = kit->pos = NULL; switch (flags) { case BPF_TASK_ITER_ALL_THREADS: case BPF_TASK_ITER_ALL_PROCS: @@ -1017,20 +1014,16 @@ __bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) if (flags == BPF_TASK_ITER_ALL_PROCS) goto get_next_task; - kit->pos = next_thread(kit->pos); - if (kit->pos == kit->task) { - if (flags == BPF_TASK_ITER_PROC_THREADS) { - kit->pos = NULL; - return pos; - } - } else + kit->pos = __next_thread(kit->pos); + if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS) return pos; get_next_task: - kit->pos = next_task(kit->pos); - kit->task = kit->pos; - if (kit->pos == &init_task) + kit->task = next_task(kit->task); + if (kit->task == &init_task) kit->pos = NULL; + else + kit->pos = kit->task; return pos; } diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 3d7127f439a1..f4c91c9b27d7 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -208,7 +208,12 @@ struct tnum tnum_clear_subreg(struct tnum a) return tnum_lshift(tnum_rshift(a, 32), 32); } +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg) +{ + return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg)); +} + struct tnum tnum_const_subreg(struct tnum a, u32 value) { - return tnum_or(tnum_clear_subreg(a), tnum_const(value)); + return tnum_with_subreg(a, tnum_const(value)); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6da370a047fe..1340921ea311 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -342,27 +342,6 @@ struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); static DEFINE_MUTEX(bpf_percpu_ma_lock); -static const struct bpf_line_info * -find_linfo(const struct bpf_verifier_env *env, u32 insn_off) -{ - const struct bpf_line_info *linfo; - const struct bpf_prog *prog; - u32 i, nr_linfo; - - prog = env->prog; - nr_linfo = prog->aux->nr_linfo; - - if (!nr_linfo || insn_off >= prog->len) - return NULL; - - linfo = prog->aux->linfo; - for (i = 1; i < nr_linfo; i++) - if (insn_off < linfo[i].insn_off) - break; - - return &linfo[i - 1]; -} - __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) { struct bpf_verifier_env *env = private_data; @@ -376,42 +355,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } -static const char *ltrim(const char *s) -{ - while (isspace(*s)) - s++; - - return s; -} - -__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, - u32 insn_off, - const char *prefix_fmt, ...) -{ - const struct bpf_line_info *linfo; - - if (!bpf_verifier_log_needed(&env->log)) - return; - - linfo = find_linfo(env, insn_off); - if (!linfo || linfo == env->prev_linfo) - return; - - if (prefix_fmt) { - va_list args; - - va_start(args, prefix_fmt); - bpf_verifier_vlog(&env->log, prefix_fmt, args); - va_end(args); - } - - verbose(env, "%s\n", - ltrim(btf_name_by_offset(env->prog->aux->btf, - linfo->line_off))); - - env->prev_linfo = linfo; -} - static void verbose_invalid_scalar(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct tnum *range, const char *ctx, @@ -430,21 +373,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, verbose(env, " should have been in %s\n", tn_buf); } -static bool type_is_pkt_pointer(enum bpf_reg_type type) -{ - type = base_type(type); - return type == PTR_TO_PACKET || - type == PTR_TO_PACKET_META; -} - -static bool type_is_sk_pointer(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_XDP_SOCK; -} - static bool type_may_be_null(u32 type) { return type & PTR_MAYBE_NULL; @@ -468,16 +396,6 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MEM; } -static bool type_is_ptr_alloc_obj(u32 type) -{ - return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; -} - -static bool type_is_non_owning_ref(u32 type) -{ - return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; -} - static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; @@ -594,83 +512,6 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } -/* string representation of 'enum bpf_reg_type' - * - * Note that reg_type_str() can not appear more than once in a single verbose() - * statement. - */ -static const char *reg_type_str(struct bpf_verifier_env *env, - enum bpf_reg_type type) -{ - char postfix[16] = {0}, prefix[64] = {0}; - static const char * const str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "scalar", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_BUF] = "buf", - [PTR_TO_FUNC] = "func", - [PTR_TO_MAP_KEY] = "map_key", - [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", - }; - - if (type & PTR_MAYBE_NULL) { - if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); - else - strncpy(postfix, "_or_null", 16); - } - - snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", - type & MEM_RDONLY ? "rdonly_" : "", - type & MEM_RINGBUF ? "ringbuf_" : "", - type & MEM_USER ? "user_" : "", - type & MEM_PERCPU ? "percpu_" : "", - type & MEM_RCU ? "rcu_" : "", - type & PTR_UNTRUSTED ? "untrusted_" : "", - type & PTR_TRUSTED ? "trusted_" : "" - ); - - snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", - prefix, str[base_type(type)], postfix); - return env->tmp_str_buf; -} - -static char slot_type_char[] = { - [STACK_INVALID] = '?', - [STACK_SPILL] = 'r', - [STACK_MISC] = 'm', - [STACK_ZERO] = '0', - [STACK_DYNPTR] = 'd', - [STACK_ITER] = 'i', -}; - -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); -} - static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -740,87 +581,6 @@ static const char *btf_type_name(const struct btf *btf, u32 id) return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } -static const char *dynptr_type_str(enum bpf_dynptr_type type) -{ - switch (type) { - case BPF_DYNPTR_TYPE_LOCAL: - return "local"; - case BPF_DYNPTR_TYPE_RINGBUF: - return "ringbuf"; - case BPF_DYNPTR_TYPE_SKB: - return "skb"; - case BPF_DYNPTR_TYPE_XDP: - return "xdp"; - case BPF_DYNPTR_TYPE_INVALID: - return "<invalid>"; - default: - WARN_ONCE(1, "unknown dynptr type %d\n", type); - return "<unknown>"; - } -} - -static const char *iter_type_str(const struct btf *btf, u32 btf_id) -{ - if (!btf || btf_id == 0) - return "<invalid>"; - - /* we already validated that type is valid and has conforming name */ - return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; -} - -static const char *iter_state_str(enum bpf_iter_state state) -{ - switch (state) { - case BPF_ITER_STATE_ACTIVE: - return "active"; - case BPF_ITER_STATE_DRAINED: - return "drained"; - case BPF_ITER_STATE_INVALID: - return "<invalid>"; - default: - WARN_ONCE(1, "unknown iter state %d\n", state); - return "<unknown>"; - } -} - -static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) -{ - env->scratched_regs |= 1U << regno; -} - -static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) -{ - env->scratched_stack_slots |= 1ULL << spi; -} - -static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) -{ - return (env->scratched_regs >> regno) & 1; -} - -static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) -{ - return (env->scratched_stack_slots >> regno) & 1; -} - -static bool verifier_state_scratched(const struct bpf_verifier_env *env) -{ - return env->scratched_regs || env->scratched_stack_slots; -} - -static void mark_verifier_state_clean(struct bpf_verifier_env *env) -{ - env->scratched_regs = 0U; - env->scratched_stack_slots = 0ULL; -} - -/* Used for printing the entire verifier state. */ -static void mark_verifier_state_scratched(struct bpf_verifier_env *env) -{ - env->scratched_regs = ~0U; - env->scratched_stack_slots = ~0ULL; -} - static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) { switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -1360,226 +1120,6 @@ static void scrub_spilled_slot(u8 *stype) *stype = STACK_MISC; } -static void print_scalar_ranges(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - const char **sep) -{ - struct { - const char *name; - u64 val; - bool omit; - } minmaxs[] = { - {"smin", reg->smin_value, reg->smin_value == S64_MIN}, - {"smax", reg->smax_value, reg->smax_value == S64_MAX}, - {"umin", reg->umin_value, reg->umin_value == 0}, - {"umax", reg->umax_value, reg->umax_value == U64_MAX}, - {"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN}, - {"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX}, - {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, - {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, - }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; - bool neg1, neg2; - - for (m1 = &minmaxs[0]; m1 < mend; m1++) { - if (m1->omit) - continue; - - neg1 = m1->name[0] == 's' && (s64)m1->val < 0; - - verbose(env, "%s%s=", *sep, m1->name); - *sep = ","; - - for (m2 = m1 + 2; m2 < mend; m2 += 2) { - if (m2->omit || m2->val != m1->val) - continue; - /* don't mix negatives with positives */ - neg2 = m2->name[0] == 's' && (s64)m2->val < 0; - if (neg2 != neg1) - continue; - m2->omit = true; - verbose(env, "%s=", m2->name); - } - - verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val); - } -} - -static void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state, - bool print_all) -{ - const struct bpf_reg_state *reg; - enum bpf_reg_type t; - int i; - - if (state->frameno) - verbose(env, " frame%d:", state->frameno); - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - t = reg->type; - if (t == NOT_INIT) - continue; - if (!print_all && !reg_scratched(env, i)) - continue; - verbose(env, " R%d", i); - print_liveness(env, reg->live); - verbose(env, "="); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && - tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - verbose(env, "%lld", reg->var_off.value + reg->off); - } else { - const char *sep = ""; - - verbose(env, "%s", reg_type_str(env, t)); - if (base_type(t) == PTR_TO_BTF_ID) - verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); - verbose(env, "("); -/* - * _a stands for append, was shortened to avoid multiline statements below. - * This macro is used to output a comma separated list of attributes. - */ -#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) - - if (reg->id) - verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); - if (type_is_non_owning_ref(reg->type)) - verbose_a("%s", "non_own_ref"); - if (t != SCALAR_VALUE) - verbose_a("off=%d", reg->off); - if (type_is_pkt_pointer(t)) - verbose_a("r=%d", reg->range); - else if (base_type(t) == CONST_PTR_TO_MAP || - base_type(t) == PTR_TO_MAP_KEY || - base_type(t) == PTR_TO_MAP_VALUE) - verbose_a("ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); - if (tnum_is_const(reg->var_off)) { - /* Typically an immediate SCALAR_VALUE, but - * could be a pointer whose offset is too big - * for reg->off - */ - verbose_a("imm=%llx", reg->var_off.value); - } else { - print_scalar_ranges(env, reg, &sep); - if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose_a("var_off=%s", tn_buf); - } - } -#undef verbose_a - - verbose(env, ")"); - } - } - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - char types_buf[BPF_REG_SIZE + 1]; - bool valid = false; - int j; - - for (j = 0; j < BPF_REG_SIZE; j++) { - if (state->stack[i].slot_type[j] != STACK_INVALID) - valid = true; - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; - } - types_buf[BPF_REG_SIZE] = 0; - if (!valid) - continue; - if (!print_all && !stack_slot_scratched(env, i)) - continue; - switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { - case STACK_SPILL: - reg = &state->stack[i].spilled_ptr; - t = reg->type; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) - verbose(env, "%lld", reg->var_off.value + reg->off); - break; - case STACK_DYNPTR: - i += BPF_DYNPTR_NR_SLOTS - 1; - reg = &state->stack[i].spilled_ptr; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); - if (reg->ref_obj_id) - verbose(env, "(ref_id=%d)", reg->ref_obj_id); - break; - case STACK_ITER: - /* only main slot has ref_obj_id set; skip others */ - reg = &state->stack[i].spilled_ptr; - if (!reg->ref_obj_id) - continue; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", - iter_type_str(reg->iter.btf, reg->iter.btf_id), - reg->ref_obj_id, iter_state_str(reg->iter.state), - reg->iter.depth); - break; - case STACK_MISC: - case STACK_ZERO: - default: - reg = &state->stack[i].spilled_ptr; - - for (j = 0; j < BPF_REG_SIZE; j++) - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; - types_buf[BPF_REG_SIZE] = 0; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); - break; - } - } - if (state->acquired_refs && state->refs[0].id) { - verbose(env, " refs=%d", state->refs[0].id); - for (i = 1; i < state->acquired_refs; i++) - if (state->refs[i].id) - verbose(env, ",%d", state->refs[i].id); - } - if (state->in_callback_fn) - verbose(env, " cb"); - if (state->in_async_callback_fn) - verbose(env, " async_cb"); - verbose(env, "\n"); - if (!print_all) - mark_verifier_state_clean(env); -} - -static inline u32 vlog_alignment(u32 pos) -{ - return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), - BPF_LOG_MIN_ALIGNMENT) - pos - 1; -} - -static void print_insn_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state) -{ - if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { - /* remove new line character */ - bpf_vlog_reset(&env->log, env->prev_log_pos - 1); - verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); - } else { - verbose(env, "%d:", env->insn_idx); - } - print_verifier_state(env, state, false); -} - /* copy array src of length n * size bytes to dst. dst is reallocated if it's too * small to hold src. This is different from krealloc since we don't want to preserve * the contents of dst. @@ -2329,69 +1869,214 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) /* Uses signed min/max values to inform unsigned, and vice-versa */ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) { - /* Learn sign from signed bounds. - * If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. + /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 + * bits to improve our u32/s32 boundaries. + * + * E.g., the case where we have upper 32 bits as zero ([10, 20] in + * u64) is pretty trivial, it's obvious that in u32 we'll also have + * [10, 20] range. But this property holds for any 64-bit range as + * long as upper 32 bits in that entire range of values stay the same. + * + * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] + * in decimal) has the same upper 32 bits throughout all the values in + * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) + * range. + * + * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, + * following the rules outlined below about u64/s64 correspondence + * (which equally applies to u32 vs s32 correspondence). In general it + * depends on actual hexadecimal values of 32-bit range. They can form + * only valid u32, or only valid s32 ranges in some cases. + * + * So we use all these insights to derive bounds for subregisters here. */ - if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) { - reg->s32_min_value = reg->u32_min_value = - max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = reg->u32_max_value = - min_t(u32, reg->s32_max_value, reg->u32_max_value); - return; + if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { + /* u64 to u32 casting preserves validity of low 32 bits as + * a range, if upper 32 bits are the same + */ + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); + + if ((s32)reg->umin_value <= (s32)reg->umax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + } + } + if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { + /* low 32 bits should form a proper u32 range */ + if ((u32)reg->smin_value <= (u32)reg->smax_value) { + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); + } + /* low 32 bits should form a proper s32 range */ + if ((s32)reg->smin_value <= (s32)reg->smax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + } + } + /* Special case where upper bits form a small sequence of two + * sequential numbers (in 32-bit unsigned space, so 0xffffffff to + * 0x00000000 is also valid), while lower bits form a proper s32 range + * going from negative numbers to positive numbers. E.g., let's say we + * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). + * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, + * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, + * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). + * Note that it doesn't have to be 0xffffffff going to 0x00000000 in + * upper 32 bits. As a random example, s64 range + * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range + * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. + */ + if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && + (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + } + if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && + (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + } + /* if u32 range forms a valid s32 range (due to matching sign bit), + * try to learn from that + */ + if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); } - /* Learn sign from unsigned bounds. Signed bounds cross the sign - * boundary, so we must be careful. + /* If we cannot cross the sign boundary, then signed and unsigned bounds + * are the same, so combine. This works even in the negative case, e.g. + * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if ((s32)reg->u32_max_value >= 0) { - /* Positive. We can't learn anything from the smin, but smax - * is positive, hence safe. - */ - reg->s32_min_value = reg->u32_min_value; - reg->s32_max_value = reg->u32_max_value = - min_t(u32, reg->s32_max_value, reg->u32_max_value); - } else if ((s32)reg->u32_min_value < 0) { - /* Negative. We can't learn anything from the smax, but smin - * is negative, hence safe. - */ - reg->s32_min_value = reg->u32_min_value = - max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = reg->u32_max_value; + if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { + reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); } } static void __reg64_deduce_bounds(struct bpf_reg_state *reg) { - /* Learn sign from signed bounds. - * If we cannot cross the sign boundary, then signed and unsigned bounds + /* If u64 range forms a valid s64 range (due to matching sign bit), + * try to learn from that. Let's do a bit of ASCII art to see when + * this is happening. Let's take u64 range first: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * + * Valid u64 range is formed when umin and umax are anywhere in the + * range [0, U64_MAX], and umin <= umax. u64 case is simple and + * straightforward. Let's see how s64 range maps onto the same range + * of values, annotated below the line for comparison: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * + * So s64 values basically start in the middle and they are logically + * contiguous to the right of it, wrapping around from -1 to 0, and + * then finishing as S64_MAX (0x7fffffffffffffff) right before + * S64_MIN. We can try drawing the continuity of u64 vs s64 values + * more visually as mapped to sign-agnostic range of hex values. + * + * u64 start u64 end + * _______________________________________________________________ + * / \ + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * / \ + * >------------------------------ -------------------------------> + * s64 continues... s64 end s64 start s64 "midpoint" + * + * What this means is that, in general, we can't always derive + * something new about u64 from any random s64 range, and vice versa. + * + * But we can do that in two particular cases. One is when entire + * u64/s64 range is *entirely* contained within left half of the above + * diagram or when it is *entirely* contained in the right half. I.e.: + * + * |-------------------------------|--------------------------------| + * ^ ^ ^ ^ + * A B C D + * + * [A, B] and [C, D] are contained entirely in their respective halves + * and form valid contiguous ranges as both u64 and s64 values. [A, B] + * will be non-negative both as u64 and s64 (and in fact it will be + * identical ranges no matter the signedness). [C, D] treated as s64 + * will be a range of negative values, while in u64 it will be + * non-negative range of values larger than 0x8000000000000000. + * + * Now, any other range here can't be represented in both u64 and s64 + * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid + * contiguous u64 ranges, but they are discontinuous in s64. [B, C] + * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], + * for example. Similarly, valid s64 range [D, A] (going from negative + * to positive values), would be two separate [D, U64_MAX] and [0, A] + * ranges as u64. Currently reg_state can't represent two segments per + * numeric domain, so in such situations we can only derive maximal + * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). + * + * So we use these facts to derive umin/umax from smin/smax and vice + * versa only if they stay within the same "half". This is equivalent + * to checking sign bit: lower half will have sign bit as zero, upper + * half have sign bit 1. Below in code we simplify this by just + * casting umin/umax as smin/smax and checking if they form valid + * range, and vice versa. Those are equivalent checks. + */ + if ((s64)reg->umin_value <= (s64)reg->umax_value) { + reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); + reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); + } + /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if (reg->smin_value >= 0 || reg->smax_value < 0) { - reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, - reg->umin_value); - reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, - reg->umax_value); - return; + if ((u64)reg->smin_value <= (u64)reg->smax_value) { + reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); + reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); } - /* Learn sign from unsigned bounds. Signed bounds cross the sign - * boundary, so we must be careful. +} + +static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) +{ + /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit + * values on both sides of 64-bit range in hope to have tigher range. + * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from + * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. + * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound + * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of + * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a + * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. + * We just need to make sure that derived bounds we are intersecting + * with are well-formed ranges in respecitve s64 or u64 domain, just + * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. */ - if ((s64)reg->umax_value >= 0) { - /* Positive. We can't learn anything from the smin, but smax - * is positive, hence safe. - */ - reg->smin_value = reg->umin_value; - reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, - reg->umax_value); - } else if ((s64)reg->umin_value < 0) { - /* Negative. We can't learn anything from the smax, but smin - * is negative, hence safe. - */ - reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, - reg->umin_value); - reg->smax_value = reg->umax_value; + __u64 new_umin, new_umax; + __s64 new_smin, new_smax; + + /* u32 -> u64 tightening, it's always well-formed */ + new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; + new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; + reg->umin_value = max_t(u64, reg->umin_value, new_umin); + reg->umax_value = min_t(u64, reg->umax_value, new_umax); + /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ + new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; + new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; + reg->smin_value = max_t(s64, reg->smin_value, new_smin); + reg->smax_value = min_t(s64, reg->smax_value, new_smax); + + /* if s32 can be treated as valid u32 range, we can use it as well */ + if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { + /* s32 -> u64 tightening */ + new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; + new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; + reg->umin_value = max_t(u64, reg->umin_value, new_umin); + reg->umax_value = min_t(u64, reg->umax_value, new_umax); + /* s32 -> s64 tightening */ + new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; + new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; + reg->smin_value = max_t(s64, reg->smin_value, new_smin); + reg->smax_value = min_t(s64, reg->smax_value, new_smax); } } @@ -2399,6 +2084,7 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) { __reg32_deduce_bounds(reg); __reg64_deduce_bounds(reg); + __reg_deduce_mixed_bounds(reg); } /* Attempts to improve var_off based on unsigned min/max information */ @@ -2420,6 +2106,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); /* We might have learned something about the sign bit. */ __reg_deduce_bounds(reg); + __reg_deduce_bounds(reg); /* We might have learned some bits from the bounds. */ __reg_bound_offset(reg); /* Intersecting with the old var_off might have improved our bounds @@ -2429,6 +2116,56 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } +static int reg_bounds_sanity_check(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, const char *ctx) +{ + const char *msg; + + if (reg->umin_value > reg->umax_value || + reg->smin_value > reg->smax_value || + reg->u32_min_value > reg->u32_max_value || + reg->s32_min_value > reg->s32_max_value) { + msg = "range bounds violation"; + goto out; + } + + if (tnum_is_const(reg->var_off)) { + u64 uval = reg->var_off.value; + s64 sval = (s64)uval; + + if (reg->umin_value != uval || reg->umax_value != uval || + reg->smin_value != sval || reg->smax_value != sval) { + msg = "const tnum out of sync with range bounds"; + goto out; + } + } + + if (tnum_subreg_is_const(reg->var_off)) { + u32 uval32 = tnum_subreg(reg->var_off).value; + s32 sval32 = (s32)uval32; + + if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || + reg->s32_min_value != sval32 || reg->s32_max_value != sval32) { + msg = "const subreg tnum out of sync with range bounds"; + goto out; + } + } + + return 0; +out: + verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " + "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", + ctx, msg, reg->umin_value, reg->umax_value, + reg->smin_value, reg->smax_value, + reg->u32_min_value, reg->u32_max_value, + reg->s32_min_value, reg->s32_max_value, + reg->var_off.value, reg->var_off.mask); + if (env->test_reg_invariants) + return -EFAULT; + __mark_reg_unbounded(reg); + return 0; +} + static bool __reg32_bound_s64(s32 a) { return a >= 0 && a <= S32_MAX; @@ -2453,51 +2190,6 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } } -static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -{ - /* special case when 64-bit register has upper 32-bit register - * zeroed. Typically happens after zext or <<32, >>32 sequence - * allowing us to use 32-bit bounds directly, - */ - if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) { - __reg_assign_32_into_64(reg); - } else { - /* Otherwise the best we can do is push lower 32bit known and - * unknown bits into register (var_off set from jmp logic) - * then learn as much as possible from the 64-bit tnum - * known and unknown bits. The previous smin/smax bounds are - * invalid here because of jmp32 compare so mark them unknown - * so they do not impact tnum bounds calculation. - */ - __mark_reg64_unbounded(reg); - } - reg_bounds_sync(reg); -} - -static bool __reg64_bound_s32(s64 a) -{ - return a >= S32_MIN && a <= S32_MAX; -} - -static bool __reg64_bound_u32(u64 a) -{ - return a >= U32_MIN && a <= U32_MAX; -} - -static void __reg_combine_64_into_32(struct bpf_reg_state *reg) -{ - __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { - reg->s32_min_value = (s32)reg->smin_value; - reg->s32_max_value = (s32)reg->smax_value; - } - if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) { - reg->u32_min_value = (u32)reg->umin_value; - reg->u32_max_value = (u32)reg->umax_value; - } - reg_bounds_sync(reg); -} - /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -4566,9 +4258,17 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } -static bool register_is_const(struct bpf_reg_state *reg) +/* check if register is a constant scalar value */ +static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) { - return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); + return reg->type == SCALAR_VALUE && + tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off); +} + +/* assuming is_reg_const() is true, return constant value of a register */ +static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) +{ + return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; } static bool __is_scalar_unbounded(struct bpf_reg_state *reg) @@ -5425,10 +5125,23 @@ BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) { if (!btf_is_kernel(btf)) - return false; + return true; return btf_id_set_contains(&rcu_protected_types, btf_id); } +static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field) +{ + struct btf_struct_meta *meta; + + if (btf_is_kernel(kptr_field->kptr.btf)) + return NULL; + + meta = btf_find_struct_meta(kptr_field->kptr.btf, + kptr_field->kptr.btf_id); + + return meta ? meta->record : NULL; +} + static bool rcu_safe_kptr(const struct btf_field *field) { const struct btf_field_kptr *kptr = &field->kptr; @@ -5439,12 +5152,25 @@ static bool rcu_safe_kptr(const struct btf_field *field) static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) { + struct btf_record *rec; + u32 ret; + + ret = PTR_MAYBE_NULL; if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) { - if (kptr_field->type != BPF_KPTR_PERCPU) - return PTR_MAYBE_NULL | MEM_RCU; - return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU; + ret |= MEM_RCU; + if (kptr_field->type == BPF_KPTR_PERCPU) + ret |= MEM_PERCPU; + else if (!btf_is_kernel(kptr_field->kptr.btf)) + ret |= MEM_ALLOC; + + rec = kptr_pointee_btf_record(kptr_field); + if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE)) + ret |= NON_OWN_REF; + } else { + ret |= PTR_UNTRUSTED; } - return PTR_MAYBE_NULL | PTR_UNTRUSTED; + + return ret; } static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, @@ -6218,9 +5944,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * values are also truncated so we push 64-bit bounds into * 32-bit bounds. Above were truncated < 32-bits already. */ - if (size >= 4) - return; - __reg_combine_64_into_32(reg); + if (size < 4) { + __mark_reg32_unbounded(reg); + reg_bounds_sync(reg); + } } static void set_sext64_default_val(struct bpf_reg_state *reg, int size) @@ -8600,6 +8327,54 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, return state->stack[spi].spilled_ptr.dynptr.type; } +static int check_reg_const_str(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno) +{ + struct bpf_map *map = reg->map_ptr; + int err; + int map_off; + u64 map_addr; + char *str_ptr; + + if (reg->type != PTR_TO_MAP_VALUE) + return -EINVAL; + + if (!bpf_map_is_rdonly(map)) { + verbose(env, "R%d does not point to a readonly map'\n", regno); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a constant address'\n", regno); + return -EACCES; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + return -EACCES; + } + + err = check_map_access(env, regno, reg->off, + map->value_size - reg->off, false, + ACCESS_HELPER); + if (err) + return err; + + map_off = reg->off + reg->var_off.value; + err = map->ops->map_direct_value_addr(map, &map_addr, map_off); + if (err) { + verbose(env, "direct value access on string failed\n"); + return err; + } + + str_ptr = (char *)(long)(map_addr); + if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { + verbose(env, "string is not zero-terminated\n"); + return -EINVAL; + } + return 0; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -8844,44 +8619,9 @@ skip_type_check: } case ARG_PTR_TO_CONST_STR: { - struct bpf_map *map = reg->map_ptr; - int map_off; - u64 map_addr; - char *str_ptr; - - if (!bpf_map_is_rdonly(map)) { - verbose(env, "R%d does not point to a readonly map'\n", regno); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a constant address'\n", regno); - return -EACCES; - } - - if (!map->ops->map_direct_value_addr) { - verbose(env, "no direct value access support for this map type\n"); - return -EACCES; - } - - err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false, - ACCESS_HELPER); + err = check_reg_const_str(env, reg, regno); if (err) return err; - - map_off = reg->off + reg->var_off.value; - err = map->ops->map_direct_value_addr(map, &map_addr, map_off); - if (err) { - verbose(env, "direct value access on string failed\n"); - return err; - } - - str_ptr = (char *)(long)(map_addr); - if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { - verbose(env, "string is not zero-terminated\n"); - return -EINVAL; - } break; } case ARG_PTR_TO_KPTR: @@ -9810,14 +9550,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } -static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, - int func_id, - struct bpf_call_arg_meta *meta) +static int do_refine_retval_range(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, int ret_type, + int func_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; if (ret_type != RET_INTEGER) - return; + return 0; switch (func_id) { case BPF_FUNC_get_stack: @@ -9843,6 +9584,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, reg_bounds_sync(ret_reg); break; } + + return reg_bounds_sanity_check(env, ret_reg, "retval"); } static int @@ -9912,7 +9655,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, val = reg->var_off.value; max = map->max_entries; - if (!(register_is_const(reg) && val < max)) { + if (!(is_reg_const(reg, false) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -10494,7 +10237,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].ref_obj_id = id; } - do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); + if (err) + return err; err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) @@ -10672,6 +10417,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param return __kfunc_param_match_suffix(btf, arg, "__nullable"); } +static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__str"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -10815,6 +10565,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, KF_ARG_PTR_TO_NULL, + KF_ARG_PTR_TO_CONST_STR, }; enum special_kfunc_type { @@ -10965,6 +10716,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RB_NODE; + if (is_kfunc_arg_const_str(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_CONST_STR; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11596,6 +11350,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_MEM_SIZE: case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: + case KF_ARG_PTR_TO_CONST_STR: /* Trusted by default */ break; default: @@ -11867,6 +11622,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; break; + case KF_ARG_PTR_TO_CONST_STR: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a const string\n", i); + return -EINVAL; + } + ret = check_reg_const_str(env, reg, regno); + if (ret) + return ret; + break; } } @@ -13986,13 +13750,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* check dest operand */ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + err = err ?: adjust_reg_min_max_vals(env, insn); if (err) return err; - - return adjust_reg_min_max_vals(env, insn); } - return 0; + return reg_bounds_sanity_check(env, ®s[insn->dst_reg], "alu"); } static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, @@ -14074,161 +13837,130 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } -static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) -{ - struct tnum subreg = tnum_subreg(reg->var_off); - s32 sval = (s32)val; - - switch (opcode) { - case BPF_JEQ: - if (tnum_is_const(subreg)) - return !!tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) - return 0; - else if (sval < reg->s32_min_value || sval > reg->s32_max_value) - return 0; - break; - case BPF_JNE: - if (tnum_is_const(subreg)) - return !tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) - return 1; - else if (sval < reg->s32_min_value || sval > reg->s32_max_value) - return 1; - break; - case BPF_JSET: - if ((~subreg.mask & subreg.value) & val) - return 1; - if (!((subreg.mask | subreg.value) & val)) - return 0; - break; - case BPF_JGT: - if (reg->u32_min_value > val) - return 1; - else if (reg->u32_max_value <= val) - return 0; - break; - case BPF_JSGT: - if (reg->s32_min_value > sval) - return 1; - else if (reg->s32_max_value <= sval) - return 0; - break; - case BPF_JLT: - if (reg->u32_max_value < val) - return 1; - else if (reg->u32_min_value >= val) - return 0; - break; - case BPF_JSLT: - if (reg->s32_max_value < sval) - return 1; - else if (reg->s32_min_value >= sval) - return 0; - break; - case BPF_JGE: - if (reg->u32_min_value >= val) - return 1; - else if (reg->u32_max_value < val) - return 0; - break; - case BPF_JSGE: - if (reg->s32_min_value >= sval) - return 1; - else if (reg->s32_max_value < sval) - return 0; - break; - case BPF_JLE: - if (reg->u32_max_value <= val) - return 1; - else if (reg->u32_min_value > val) - return 0; - break; - case BPF_JSLE: - if (reg->s32_max_value <= sval) - return 1; - else if (reg->s32_min_value > sval) - return 0; - break; - } - - return -1; -} - - -static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) -{ - s64 sval = (s64)val; +/* + * <reg1> <op> <reg2>, currently assuming reg2 is a constant + */ +static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; + struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; + u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; + u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; + s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; + s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; + u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; + u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value; + s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; + s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; switch (opcode) { case BPF_JEQ: - if (tnum_is_const(reg->var_off)) - return !!tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + /* constants, umin/umax and smin/smax checks would be + * redundant in this case because they all should match + */ + if (tnum_is_const(t1) && tnum_is_const(t2)) + return t1.value == t2.value; + /* non-overlapping ranges */ + if (umin1 > umax2 || umax1 < umin2) return 0; - else if (sval < reg->smin_value || sval > reg->smax_value) + if (smin1 > smax2 || smax1 < smin2) return 0; + if (!is_jmp32) { + /* if 64-bit ranges are inconclusive, see if we can + * utilize 32-bit subrange knowledge to eliminate + * branches that can't be taken a priori + */ + if (reg1->u32_min_value > reg2->u32_max_value || + reg1->u32_max_value < reg2->u32_min_value) + return 0; + if (reg1->s32_min_value > reg2->s32_max_value || + reg1->s32_max_value < reg2->s32_min_value) + return 0; + } break; case BPF_JNE: - if (tnum_is_const(reg->var_off)) - return !tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + /* constants, umin/umax and smin/smax checks would be + * redundant in this case because they all should match + */ + if (tnum_is_const(t1) && tnum_is_const(t2)) + return t1.value != t2.value; + /* non-overlapping ranges */ + if (umin1 > umax2 || umax1 < umin2) return 1; - else if (sval < reg->smin_value || sval > reg->smax_value) + if (smin1 > smax2 || smax1 < smin2) return 1; + if (!is_jmp32) { + /* if 64-bit ranges are inconclusive, see if we can + * utilize 32-bit subrange knowledge to eliminate + * branches that can't be taken a priori + */ + if (reg1->u32_min_value > reg2->u32_max_value || + reg1->u32_max_value < reg2->u32_min_value) + return 1; + if (reg1->s32_min_value > reg2->s32_max_value || + reg1->s32_max_value < reg2->s32_min_value) + return 1; + } break; case BPF_JSET: - if ((~reg->var_off.mask & reg->var_off.value) & val) + if (!is_reg_const(reg2, is_jmp32)) { + swap(reg1, reg2); + swap(t1, t2); + } + if (!is_reg_const(reg2, is_jmp32)) + return -1; + if ((~t1.mask & t1.value) & t2.value) return 1; - if (!((reg->var_off.mask | reg->var_off.value) & val)) + if (!((t1.mask | t1.value) & t2.value)) return 0; break; case BPF_JGT: - if (reg->umin_value > val) + if (umin1 > umax2) return 1; - else if (reg->umax_value <= val) + else if (umax1 <= umin2) return 0; break; case BPF_JSGT: - if (reg->smin_value > sval) + if (smin1 > smax2) return 1; - else if (reg->smax_value <= sval) + else if (smax1 <= smin2) return 0; break; case BPF_JLT: - if (reg->umax_value < val) + if (umax1 < umin2) return 1; - else if (reg->umin_value >= val) + else if (umin1 >= umax2) return 0; break; case BPF_JSLT: - if (reg->smax_value < sval) + if (smax1 < smin2) return 1; - else if (reg->smin_value >= sval) + else if (smin1 >= smax2) return 0; break; case BPF_JGE: - if (reg->umin_value >= val) + if (umin1 >= umax2) return 1; - else if (reg->umax_value < val) + else if (umax1 < umin2) return 0; break; case BPF_JSGE: - if (reg->smin_value >= sval) + if (smin1 >= smax2) return 1; - else if (reg->smax_value < sval) + else if (smax1 < smin2) return 0; break; case BPF_JLE: - if (reg->umax_value <= val) + if (umax1 <= umin2) return 1; - else if (reg->umin_value > val) + else if (umin1 > umax2) return 0; break; case BPF_JSLE: - if (reg->smax_value <= sval) + if (smax1 <= smin2) return 1; - else if (reg->smin_value > sval) + else if (smin1 > smax2) return 0; break; } @@ -14236,41 +13968,6 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return -1; } -/* compute branch direction of the expression "if (reg opcode val) goto target;" - * and return: - * 1 - branch will be taken and "goto target" will be executed - * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg < 5)" is unknown when register value - * range [0,10] - */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, - bool is_jmp32) -{ - if (__is_pointer_value(false, reg)) { - if (!reg_not_null(reg)) - return -1; - - /* If pointer is valid tests against zero will fail so we can - * use this to direct branch taken. - */ - if (val != 0) - return -1; - - switch (opcode) { - case BPF_JEQ: - return 0; - case BPF_JNE: - return 1; - default: - return -1; - } - } - - if (is_jmp32) - return is_branch32_taken(reg, val, opcode); - return is_branch64_taken(reg, val, opcode); -} - static int flip_opcode(u32 opcode) { /* How can we transform "a <op> b" into "b <op> a"? */ @@ -14332,216 +14029,244 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, return -1; } -/* Adjusts the register min/max values in the case that the dst_reg is the - * variable register that we are working on, and src_reg is a constant or we're - * simply doing a BPF_K check. - * In JEQ/JNE cases we also adjust the var_off values. +/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value + * range [0,10] */ -static void reg_set_min_max(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, - u64 val, u32 val32, - u8 opcode, bool is_jmp32) -{ - struct tnum false_32off = tnum_subreg(false_reg->var_off); - struct tnum false_64off = false_reg->var_off; - struct tnum true_32off = tnum_subreg(true_reg->var_off); - struct tnum true_64off = true_reg->var_off; - s64 sval = (s64)val; - s32 sval32 = (s32)val32; - - /* If the dst_reg is a pointer, we can't learn anything about its - * variable offset from the compare (unless src_reg were a pointer into - * the same object, but we don't bother with that. - * Since false_reg and true_reg have the same type by construction, we - * only need to check one of them for pointerness. - */ - if (__is_pointer_value(false, false_reg)) - return; +static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) + return is_pkt_ptr_branch_taken(reg1, reg2, opcode); + + if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) { + u64 val; + + /* arrange that reg2 is a scalar, and reg1 is a pointer */ + if (!is_reg_const(reg2, is_jmp32)) { + opcode = flip_opcode(opcode); + swap(reg1, reg2); + } + /* and ensure that reg2 is a constant */ + if (!is_reg_const(reg2, is_jmp32)) + return -1; + + if (!reg_not_null(reg1)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + val = reg_const_value(reg2, is_jmp32); + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } + /* now deal with two scalars, but not necessarily constants */ + return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); +} + +/* Opcode that corresponds to a *false* branch condition. + * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2 + */ +static u8 rev_opcode(u8 opcode) +{ switch (opcode) { - /* JEQ/JNE comparison doesn't change the register equivalence. - * - * r1 = r2; - * if (r1 == 42) goto label; - * ... - * label: // here both r1 and r2 are known to be 42. - * - * Hence when marking register as known preserve it's ID. + case BPF_JEQ: return BPF_JNE; + case BPF_JNE: return BPF_JEQ; + /* JSET doesn't have it's reverse opcode in BPF, so add + * BPF_X flag to denote the reverse of that operation */ + case BPF_JSET: return BPF_JSET | BPF_X; + case BPF_JSET | BPF_X: return BPF_JSET; + case BPF_JGE: return BPF_JLT; + case BPF_JGT: return BPF_JLE; + case BPF_JLE: return BPF_JGT; + case BPF_JLT: return BPF_JGE; + case BPF_JSGE: return BPF_JSLT; + case BPF_JSGT: return BPF_JSLE; + case BPF_JSLE: return BPF_JSGT; + case BPF_JSLT: return BPF_JSGE; + default: return 0; + } +} + +/* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */ +static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + struct tnum t; + u64 val; + +again: + switch (opcode) { case BPF_JEQ: if (is_jmp32) { - __mark_reg32_known(true_reg, val32); - true_32off = tnum_subreg(true_reg->var_off); + reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); + reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); + reg2->u32_min_value = reg1->u32_min_value; + reg2->u32_max_value = reg1->u32_max_value; + reg2->s32_min_value = reg1->s32_min_value; + reg2->s32_max_value = reg1->s32_max_value; + + t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); + reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - ___mark_reg_known(true_reg, val); - true_64off = true_reg->var_off; + reg1->umin_value = max(reg1->umin_value, reg2->umin_value); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value); + reg1->smin_value = max(reg1->smin_value, reg2->smin_value); + reg1->smax_value = min(reg1->smax_value, reg2->smax_value); + reg2->umin_value = reg1->umin_value; + reg2->umax_value = reg1->umax_value; + reg2->smin_value = reg1->smin_value; + reg2->smax_value = reg1->smax_value; + + reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); + reg2->var_off = reg1->var_off; } break; case BPF_JNE: - if (is_jmp32) { - __mark_reg32_known(false_reg, val32); - false_32off = tnum_subreg(false_reg->var_off); - } else { - ___mark_reg_known(false_reg, val); - false_64off = false_reg->var_off; - } + /* we don't derive any new information for inequality yet */ break; case BPF_JSET: + if (!is_reg_const(reg2, is_jmp32)) + swap(reg1, reg2); + if (!is_reg_const(reg2, is_jmp32)) + break; + val = reg_const_value(reg2, is_jmp32); + /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X) + * requires single bit to learn something useful. E.g., if we + * know that `r1 & 0x3` is true, then which bits (0, 1, or both) + * are actually set? We can learn something definite only if + * it's a single-bit value to begin with. + * + * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have + * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor + * bit 1 is set, which we can readily use in adjustments. + */ + if (!is_power_of_2(val)) + break; if (is_jmp32) { - false_32off = tnum_and(false_32off, tnum_const(~val32)); - if (is_power_of_2(val32)) - true_32off = tnum_or(true_32off, - tnum_const(val32)); + t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { - false_64off = tnum_and(false_64off, tnum_const(~val)); - if (is_power_of_2(val)) - true_64off = tnum_or(true_64off, - tnum_const(val)); + reg1->var_off = tnum_or(reg1->var_off, tnum_const(val)); } break; - case BPF_JGE: - case BPF_JGT: - { + case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */ + if (!is_reg_const(reg2, is_jmp32)) + swap(reg1, reg2); + if (!is_reg_const(reg2, is_jmp32)) + break; + val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1; - u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32; - - false_reg->u32_max_value = min(false_reg->u32_max_value, - false_umax); - true_reg->u32_min_value = max(true_reg->u32_min_value, - true_umin); + t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; - - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val)); } break; - } - case BPF_JSGE: - case BPF_JSGT: - { + case BPF_JLE: if (is_jmp32) { - s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; - s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - - false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax); - true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); + reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); } else { - s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; - - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value); + reg2->umin_value = max(reg1->umin_value, reg2->umin_value); } break; - } - case BPF_JLE: case BPF_JLT: - { if (is_jmp32) { - u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1; - u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32; - - false_reg->u32_min_value = max(false_reg->u32_min_value, - false_umin); - true_reg->u32_max_value = min(true_reg->u32_max_value, - true_umax); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); + reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); } else { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; - - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); + reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); } break; - } case BPF_JSLE: + if (is_jmp32) { + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); + reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + } else { + reg1->smax_value = min(reg1->smax_value, reg2->smax_value); + reg2->smin_value = max(reg1->smin_value, reg2->smin_value); + } + break; case BPF_JSLT: - { if (is_jmp32) { - s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; - s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - - false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin); - true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax); + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); + reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); } else { - s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; - - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); + reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); + reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; - } + case BPF_JGE: + case BPF_JGT: + case BPF_JSGE: + case BPF_JSGT: + /* just reuse LE/LT logic above */ + opcode = flip_opcode(opcode); + swap(reg1, reg2); + goto again; default: return; } - - if (is_jmp32) { - false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off), - tnum_subreg(false_32off)); - true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), - tnum_subreg(true_32off)); - __reg_combine_32_into_64(false_reg); - __reg_combine_32_into_64(true_reg); - } else { - false_reg->var_off = false_64off; - true_reg->var_off = true_64off; - __reg_combine_64_into_32(false_reg); - __reg_combine_64_into_32(true_reg); - } } -/* Same as above, but for the case that dst_reg holds a constant and src_reg is - * the variable reg. +/* Adjusts the register min/max values in the case that the dst_reg and + * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K + * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * Technically we can do similar adjustments for pointers to the same object, + * but we don't support that right now. */ -static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, - u64 val, u32 val32, - u8 opcode, bool is_jmp32) +static int reg_set_min_max(struct bpf_verifier_env *env, + struct bpf_reg_state *true_reg1, + struct bpf_reg_state *true_reg2, + struct bpf_reg_state *false_reg1, + struct bpf_reg_state *false_reg2, + u8 opcode, bool is_jmp32) { - opcode = flip_opcode(opcode); - /* This uses zero as "not present in table"; luckily the zero opcode, - * BPF_JA, can't get here. + int err; + + /* If either register is a pointer, we can't learn anything about its + * variable offset from the compare (unless they were a pointer into + * the same object, but we don't bother with that). */ - if (opcode) - reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32); -} - -/* Regs are known to be equal, so intersect their min/max/var_off */ -static void __reg_combine_min_max(struct bpf_reg_state *src_reg, - struct bpf_reg_state *dst_reg) -{ - src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value, - dst_reg->umin_value); - src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value, - dst_reg->umax_value); - src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value, - dst_reg->smin_value); - src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value, - dst_reg->smax_value); - src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, - dst_reg->var_off); - reg_bounds_sync(src_reg); - reg_bounds_sync(dst_reg); -} + if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) + return 0; -static void reg_combine_min_max(struct bpf_reg_state *true_src, - struct bpf_reg_state *true_dst, - struct bpf_reg_state *false_src, - struct bpf_reg_state *false_dst, - u8 opcode) -{ - switch (opcode) { - case BPF_JEQ: - __reg_combine_min_max(true_src, true_dst); - break; - case BPF_JNE: - __reg_combine_min_max(false_src, false_dst); - break; - } + /* fallthrough (FALSE) branch */ + regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); + reg_bounds_sync(false_reg1); + reg_bounds_sync(false_reg2); + + /* jump (TRUE) branch */ + regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32); + reg_bounds_sync(true_reg1); + reg_bounds_sync(true_reg2); + + err = reg_bounds_sanity_check(env, true_reg1, "true_reg1"); + err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2"); + err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1"); + err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2"); + return err; } static void mark_ptr_or_null_reg(struct bpf_func_state *state, @@ -14739,6 +14464,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -14779,42 +14505,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } + src_reg = &fake_reg; + src_reg->type = SCALAR_VALUE; + __mark_reg_known(src_reg, insn->imm); } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - - if (BPF_SRC(insn->code) == BPF_K) { - pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { - pred = is_branch_taken(dst_reg, - tnum_subreg(src_reg->var_off).value, - opcode, - is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(src_reg->var_off)) { - pred = is_branch_taken(dst_reg, - src_reg->var_off.value, - opcode, - is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { - pred = is_branch_taken(src_reg, - tnum_subreg(dst_reg->var_off).value, - flip_opcode(opcode), - is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(dst_reg->var_off)) { - pred = is_branch_taken(src_reg, - dst_reg->var_off.value, - flip_opcode(opcode), - is_jmp32); - } else if (reg_is_pkt_pointer_any(dst_reg) && - reg_is_pkt_pointer_any(src_reg) && - !is_jmp32) { - pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); - } - + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison. @@ -14862,53 +14559,27 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; - /* detect if we are comparing against a constant value so we can adjust - * our min/max values for our dst register. - * this is only legit if both are scalars (or pointers to the same - * object, I suppose, see the PTR_MAYBE_NULL related if block below), - * because otherwise the different base pointers mean the offsets aren't - * comparable. - */ if (BPF_SRC(insn->code) == BPF_X) { - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - - if (dst_reg->type == SCALAR_VALUE && - src_reg->type == SCALAR_VALUE) { - if (tnum_is_const(src_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(src_reg->var_off)))) - reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, - src_reg->var_off.value, - tnum_subreg(src_reg->var_off).value, - opcode, is_jmp32); - else if (tnum_is_const(dst_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(dst_reg->var_off)))) - reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - src_reg, - dst_reg->var_off.value, - tnum_subreg(dst_reg->var_off).value, - opcode, is_jmp32); - else if (!is_jmp32 && - (opcode == BPF_JEQ || opcode == BPF_JNE)) - /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch_regs[insn->src_reg], - &other_branch_regs[insn->dst_reg], - src_reg, dst_reg, opcode); - if (src_reg->id && - !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); - } - - } - } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, (u32)insn->imm, - opcode, is_jmp32); + err = reg_set_min_max(env, + &other_branch_regs[insn->dst_reg], + &other_branch_regs[insn->src_reg], + dst_reg, src_reg, opcode, is_jmp32); + } else /* BPF_SRC(insn->code) == BPF_K */ { + err = reg_set_min_max(env, + &other_branch_regs[insn->dst_reg], + src_reg /* fake one */, + dst_reg, src_reg /* same fake one */, + opcode, is_jmp32); } + if (err) + return err; + if (BPF_SRC(insn->code) == BPF_X && + src_reg->type == SCALAR_VALUE && src_reg->id && + !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { find_equal_scalars(this_branch, dst_reg); @@ -17411,10 +17082,8 @@ static int do_check(struct bpf_verifier_env *env) insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, false, BPF_MODE(insn->code) == BPF_MEMSX); - if (err) - return err; - - err = save_aux_ptr_type(env, src_reg_type, true); + err = err ?: save_aux_ptr_type(env, src_reg_type, true); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); if (err) return err; } else if (class == BPF_STX) { @@ -20701,6 +20370,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c56071f150f2..520b90dd97ec 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -164,13 +164,13 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) + list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ + lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 76db6c67e39a..04d11a7dd95f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1262,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc) return ret; } +/** + * task_get_cgroup1 - Acquires the associated cgroup of a task within a + * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its + * hierarchy ID. + * @tsk: The target task + * @hierarchy_id: The ID of a cgroup1 hierarchy + * + * On success, the cgroup is returned. On failure, ERR_PTR is returned. + * We limit it to cgroup1 only. + */ +struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) +{ + struct cgroup *cgrp = ERR_PTR(-ENOENT); + struct cgroup_root *root; + unsigned long flags; + + rcu_read_lock(); + for_each_root(root) { + /* cgroup1 only*/ + if (root == &cgrp_dfl_root) + continue; + if (root->hierarchy_id != hierarchy_id) + continue; + spin_lock_irqsave(&css_set_lock, flags); + cgrp = task_cgroup_from_root(tsk, root); + if (!cgrp || !cgroup_tryget(cgrp)) + cgrp = ERR_PTR(-ENOENT); + spin_unlock_irqrestore(&css_set_lock, flags); + break; + } + rcu_read_unlock(); + return cgrp; +} + static int __init cgroup1_wq_init(void) { /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1d5b9de3b1b9..4e610863cc37 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1347,10 +1347,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } + WARN_ON_ONCE(list_empty(&root->root_list)); + list_del_rcu(&root->root_list); + cgroup_root_count--; if (!have_favordynmods) cgroup_favor_dynmods(root, false); @@ -1390,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } - BUG_ON(!res_cgroup); + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. Consequently, + * when accessing a cgroup root, the cset_link may have already been + * freed, resulting in a NULL res_cgroup. However, by holding the + * cgroup_mutex, we ensure that res_cgroup can't be NULL. + * If we don't hold cgroup_mutex in the caller, we must do the NULL + * check. + */ return res_cgroup; } @@ -1413,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_unlock(); + /* + * The namespace_sem is held by current, so the root cgroup can't + * be umounted. Therefore, we can ensure that the res is non-NULL. + */ + WARN_ON_ONCE(!res); return res; } @@ -1449,7 +1461,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); @@ -1457,7 +1468,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. + * called with css_set_lock held to prevent task's groups from being modified. + * Must be called with either cgroup_mutex or rcu read lock to prevent the + * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -2032,7 +2045,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; - INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); @@ -2115,7 +2128,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ - list_add(&root->root_list, &cgroup_roots); + list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* @@ -6277,7 +6290,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - cgroup_lock(); + rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { @@ -6288,6 +6301,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; + cgrp = task_cgroup_from_root(tsk, root); + /* The root has already been unmounted. */ + if (!cgrp) + continue; + seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) @@ -6298,9 +6316,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); - - cgrp = task_cgroup_from_root(tsk, root); - /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, @@ -6332,7 +6347,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); - cgroup_unlock(); + rcu_read_unlock(); kfree(buf); out: return retval; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 84e8a0f6e4e0..f0b8b7c29126 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1376,6 +1376,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, struct bpf_dynptr_kern *sig_ptr, struct bpf_key *trusted_keyring) { + const void *data, *sig; + u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { @@ -1392,10 +1394,12 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, return ret; } - return verify_pkcs7_signature(data_ptr->data, - __bpf_dynptr_size(data_ptr), - sig_ptr->data, - __bpf_dynptr_size(sig_ptr), + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); |