summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-31 05:10:11 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-31 05:10:11 -1000
commit89ed67ef126c4160349c1b96fdb775ea6170ac90 (patch)
tree98caaf8bba44b21f9345a0af1dd2bd9987764e27 /kernel
parent5a6a09e97199d6600d31383055f9d43fbbcbe86f (diff)
parentf1c73396133cb3d913e2075298005644ee8dfade (diff)
Merge tag 'net-next-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Support usec resolution of TCP timestamps, enabled selectively by a route attribute. - Defer regular TCP ACK while processing socket backlog, try to send a cumulative ACK at the end. Increase single TCP flow performance on a 200Gbit NIC by 20% (100Gbit -> 120Gbit). - The Fair Queuing (FQ) packet scheduler: - add built-in 3 band prio / WRR scheduling - support bypass if the qdisc is mostly idle (5% speed up for TCP RR) - improve inactive flow reporting - optimize the layout of structures for better cache locality - Support TCP Authentication Option (RFC 5925, TCP-AO), a more modern replacement for the old MD5 option. - Add more retransmission timeout (RTO) related statistics to TCP_INFO. - Support sending fragmented skbs over vsock sockets. - Make sure we send SIGPIPE for vsock sockets if socket was shutdown(). - Add sysctl for ignoring lower limit on lifetime in Router Advertisement PIO, based on an in-progress IETF draft. - Add sysctl to control activation of TCP ping-pong mode. - Add sysctl to make connection timeout in MPTCP configurable. - Support rcvlowat and notsent_lowat on MPTCP sockets, to help apps limit the number of wakeups. - Support netlink GET for MDB (multicast forwarding), allowing user space to request a single MDB entry instead of dumping the entire table. - Support selective FDB flushing in the VXLAN tunnel driver. - Allow limiting learned FDB entries in bridges, prevent OOM attacks. - Allow controlling via configfs netconsole targets which were created via the kernel cmdline at boot, rather than via configfs at runtime. - Support multiple PTP timestamp event queue readers with different filters. - MCTP over I3C. BPF: - Add new veth-like netdevice where BPF program defines the logic of the xmit routine. It can operate in L3 and L2 mode. - Support exceptions - allow asserting conditions which should never be true but are hard for the verifier to infer. With some extra flexibility around handling of the exit / failure: https://lwn.net/Articles/938435/ - Add support for local per-cpu kptr, allow allocating and storing per-cpu objects in maps. Access to those objects operates on the value for the current CPU. This allows to deprecate local one-off implementations of per-CPU storage like BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE maps. - Extend cgroup BPF sockaddr hooks for UNIX sockets. The use case is for systemd to re-implement the LogNamespace feature which allows running multiple instances of systemd-journald to process the logs of different services. - Enable open-coded task_vma iteration, after maple tree conversion made it hard to directly walk VMAs in tracing programs. - Add open-coded task, css_task and css iterator support. One of the use cases is customizable OOM victim selection via BPF. - Allow source address selection with bpf_*_fib_lookup(). - Add ability to pin BPF timer to the current CPU. - Prevent creation of infinite loops by combining tail calls and fentry/fexit programs. - Add missed stats for kprobes to retrieve the number of missed kprobe executions and subsequent executions of BPF programs. - Inherit system settings for CPU security mitigations. - Add BPF v4 CPU instruction support for arm32 and s390x. Changes to common code: - overflow: add DEFINE_FLEX() for on-stack definition of structs with flexible array members. - Process doc update with more guidance for reviewers. Driver API: - Simplify locking in WiFi (cfg80211 and mac80211 layers), use wiphy mutex in most places and remove a lot of smaller locks. - Create a common DPLL configuration API. Allow configuring and querying state of PLL circuits used for clock syntonization, in network time distribution. - Unify fragmented and full page allocation APIs in page pool code. Let drivers be ignorant of PAGE_SIZE. - Rework PHY state machine to avoid races with calls to phy_stop(). - Notify DSA drivers of MAC address changes on user ports, improve correctness of offloads which depend on matching port MAC addresses. - Allow antenna control on injected WiFi frames. - Reduce the number of variants of napi_schedule(). - Simplify error handling when composing devlink health messages. Misc: - A lot of KCSAN data race "fixes", from Eric. - A lot of __counted_by() annotations, from Kees. - A lot of strncpy -> strscpy and printf format fixes. - Replace master/slave terminology with conduit/user in DSA drivers. - Handful of KUnit tests for netdev and WiFi core. Removed: - AppleTalk COPS. - AppleTalk ipddp. - TI AR7 CPMAC Ethernet driver. Drivers: - Ethernet high-speed NICs: - Intel (100G, ice, idpf): - add a driver for the Intel E2000 IPUs - make CRC/FCS stripping configurable - cross-timestamping for E823 devices - basic support for E830 devices - use aux-bus for managing client drivers - i40e: report firmware versions via devlink - nVidia/Mellanox: - support 4-port NICs - increase max number of channels to 256 - optimize / parallelize SF creation flow - Broadcom (bnxt): - enhance NIC temperature reporting - support PAM4 speeds and lane configuration - Marvell OcteonTX2: - PTP pulse-per-second output support - enable hardware timestamping for VFs - Solarflare/AMD: - conntrack NAT offload and offload for tunnels - Wangxun (ngbe/txgbe): - expose HW statistics - Pensando/AMD: - support PCI level reset - narrow down the condition under which skbs are linearized - Netronome/Corigine (nfp): - support CHACHA20-POLY1305 crypto in IPsec offload - Ethernet NICs embedded, slower, virtual: - Synopsys (stmmac): - add Loongson-1 SoC support - enable use of HW queues with no offload capabilities - enable PPS input support on all 5 channels - increase TX coalesce timer to 5ms - RealTek USB (r8152): improve efficiency of Rx by using GRO frags - xen: support SW packet timestamping - add drivers for implementations based on TI's PRUSS (AM64x EVM) - nVidia/Mellanox Ethernet datacenter switches: - avoid poor HW resource use on Spectrum-4 by better block selection for IPv6 multicast forwarding and ordering of blocks in ACL region - Ethernet embedded switches: - Microchip: - support configuring the drive strength for EMI compliance - ksz9477: partial ACL support - ksz9477: HSR offload - ksz9477: Wake on LAN - Realtek: - rtl8366rb: respect device tree config of the CPU port - Ethernet PHYs: - support Broadcom BCM5221 PHYs - TI dp83867: support hardware LED blinking - CAN: - add support for Linux-PHY based CAN transceivers - at91_can: clean up and use rx-offload helpers - WiFi: - MediaTek (mt76): - new sub-driver for mt7925 USB/PCIe devices - HW wireless <> Ethernet bridging in MT7988 chips - mt7603/mt7628 stability improvements - Qualcomm (ath12k): - WCN7850: - enable 320 MHz channels in 6 GHz band - hardware rfkill support - enable IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS to make scan faster - read board data variant name from SMBIOS - QCN9274: mesh support - RealTek (rtw89): - TDMA-based multi-channel concurrency (MCC) - Silicon Labs (wfx): - Remain-On-Channel (ROC) support - Bluetooth: - ISO: many improvements for broadcast support - mark BCM4378/BCM4387 as BROKEN_LE_CODED - add support for QCA2066 - btmtksdio: enable Bluetooth wakeup from suspend" * tag 'net-next-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1816 commits) net: pcs: xpcs: Add 2500BASE-X case in get state for XPCS drivers net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() net: mana: Use xdp_set_features_flag instead of direct assignment vxlan: Cleanup IFLA_VXLAN_PORT_RANGE entry in vxlan_get_size() iavf: delete the iavf client interface iavf: add a common function for undoing the interrupt scheme iavf: use unregister_netdev iavf: rely on netdev's own registered state iavf: fix the waiting time for initial reset iavf: in iavf_down, don't queue watchdog_task if comms failed iavf: simplify mutex_trylock+sleep loops iavf: fix comments about old bit locks doc/netlink: Update schema to support cmd-cnt-name and cmd-max-name tools: ynl: introduce option to process unknown attributes or types ipvlan: properly track tx_errors netdevsim: Block until all devices are released nfp: using napi_build_skb() to replace build_skb() net: dsa: microchip: ksz9477: Fix spelling mistake "Enery" -> "Energy" net: dsa: microchip: Ensure Stable PME Pin State for Wake-on-LAN net: dsa: microchip: Refactor switch shutdown routine for WoL preparation ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/bpf_iter.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c26
-rw-r--r--kernel/bpf/btf.c35
-rw-r--r--kernel/bpf/cgroup.c28
-rw-r--r--kernel/bpf/cgroup_iter.c65
-rw-r--r--kernel/bpf/core.c37
-rw-r--r--kernel/bpf/cpumap.c10
-rw-r--r--kernel/bpf/devmap.c10
-rw-r--r--kernel/bpf/hashtab.c7
-rw-r--r--kernel/bpf/helpers.c109
-rw-r--r--kernel/bpf/memalloc.c72
-rw-r--r--kernel/bpf/offload.c18
-rw-r--r--kernel/bpf/ringbuf.c3
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/bpf/syscall.c71
-rw-r--r--kernel/bpf/task_iter.c282
-rw-r--r--kernel/bpf/tcx.c4
-rw-r--r--kernel/bpf/trampoline.c4
-rw-r--r--kernel/bpf/verifier.c1293
-rw-r--r--kernel/cgroup/cgroup.c18
-rw-r--r--kernel/time/posix-clock.c36
-rw-r--r--kernel/trace/bpf_trace.c10
-rw-r--r--kernel/trace/trace_kprobe.c14
-rw-r--r--kernel/trace/trace_syscalls.c4
24 files changed, 1826 insertions, 334 deletions
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 96856f130cbf..833faa04461b 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -793,8 +793,6 @@ __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
- BTF_TYPE_EMIT(struct btf_iter_num);
-
/* start == end is legit, it's an empty range and we'll just get NULL
* on first (and any subsequent) bpf_iter_num_next() call
*/
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index fdc3e8705a3c..db6176fb64dc 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -615,7 +615,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
- bpf_jit_free_exec(st_map->image);
+ if (st_map->image) {
+ bpf_jit_free_exec(st_map->image);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ }
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
@@ -657,6 +660,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
struct bpf_map *map;
+ int ret;
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
if (!st_ops)
@@ -681,12 +685,27 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->st_ops = st_ops;
map = &st_map->map;
+ ret = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (ret) {
+ __bpf_struct_ops_map_free(map);
+ return ERR_PTR(ret);
+ }
+
+ st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!st_map->image) {
+ /* __bpf_struct_ops_map_free() uses st_map->image as flag
+ * for "charged or not". In this case, we need to unchange
+ * here.
+ */
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ __bpf_struct_ops_map_free(map);
+ return ERR_PTR(-ENOMEM);
+ }
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->links =
bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
NUMA_NO_NODE);
- st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!st_map->uvalue || !st_map->links || !st_map->image) {
+ if (!st_map->uvalue || !st_map->links) {
__bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
@@ -907,4 +926,3 @@ err_out:
kfree(link);
return err;
}
-
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8090d7fb11ef..15d71d2986d3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3293,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
type = BPF_KPTR_UNREF;
else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_REF;
+ else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_PERCPU;
else
return -EINVAL;
@@ -3308,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
return BTF_FIELD_FOUND;
}
-static const char *btf_find_decl_tag_value(const struct btf *btf,
- const struct btf_type *pt,
- int comp_idx, const char *tag_key)
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
{
+ const char *value = NULL;
int i;
for (i = 1; i < btf_nr_types(btf); i++) {
@@ -3325,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
continue;
if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
continue;
- return __btf_name_by_offset(btf, t->name_off) + len;
+ /* Prevent duplicate entries for same type */
+ if (value)
+ return ERR_PTR(-EEXIST);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
}
- return NULL;
+ if (!value)
+ return ERR_PTR(-ENOENT);
+ return value;
}
static int
@@ -3345,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
if (t->size != sz)
return BTF_FIELD_IGNORE;
value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
- if (!value_type)
+ if (IS_ERR(value_type))
return -EINVAL;
node_field_name = strstr(value_type, ":");
if (!node_field_name)
@@ -3457,6 +3464,7 @@ static int btf_find_struct_field(const struct btf *btf,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
ret = btf_find_kptr(btf, member_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3523,6 +3531,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
ret = btf_find_kptr(btf, var_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3783,6 +3792,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
if (ret < 0)
goto end;
@@ -6949,7 +6959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
* (either PTR_TO_CTX or SCALAR_VALUE).
*/
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
+ struct bpf_reg_state *regs, bool is_ex_cb)
{
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
@@ -7006,7 +7016,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function returns int */
+ /* check that function returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
@@ -7055,6 +7065,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
i, btf_type_str(t), tname);
return -EINVAL;
}
+ /* We have already ensured that the callback returns an integer, just
+ * like all global subprogs. We need to determine it only has a single
+ * scalar argument.
+ */
+ if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
+ bpf_log(log, "exception cb only supports single integer argument\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -7832,6 +7850,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
return BTF_KFUNC_HOOK_CGROUP_SKB;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 03b3d4492980..74ad2215e1ba 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1450,18 +1450,22 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ * uaddr.
* @atype: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
*
- * socket is expected to be of type INET or INET6.
+ * socket is expected to be of type INET, INET6 or UNIX.
*
* This function will return %-EPERM if an attached program is found and
* returned value != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
+ int *uaddrlen,
enum cgroup_bpf_attach_type atype,
void *t_ctx,
u32 *flags)
@@ -1473,21 +1477,31 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
};
struct sockaddr_storage unspec;
struct cgroup *cgrp;
+ int ret;
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
*/
- if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+ sk->sk_family != AF_UNIX)
return 0;
if (!ctx.uaddr) {
memset(&unspec, 0, sizeof(unspec));
ctx.uaddr = (struct sockaddr *)&unspec;
+ ctx.uaddrlen = 0;
+ } else {
+ ctx.uaddrlen = *uaddrlen;
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
- 0, flags);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+ 0, flags);
+
+ if (!ret && uaddr)
+ *uaddrlen = ctx.uaddrlen;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -2520,10 +2534,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return NULL;
default:
return &bpf_get_retval_proto;
@@ -2535,10 +2552,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return NULL;
default:
return &bpf_set_retval_proto;
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 810378f04fbc..209e5135f9fb 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -294,3 +294,68 @@ static int __init bpf_cgroup_iter_init(void)
}
late_initcall(bpf_cgroup_iter_init);
+
+struct bpf_iter_css {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_kern {
+ struct cgroup_subsys_state *start;
+ struct cgroup_subsys_state *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
+ struct cgroup_subsys_state *start, unsigned int flags)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
+
+ kit->start = NULL;
+ switch (flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->start = start;
+ kit->pos = NULL;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ if (!kit->start)
+ return NULL;
+
+ switch (kit->flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ kit->pos = css_next_descendant_pre(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ kit->pos = css_next_descendant_post(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ kit->pos = kit->pos ? kit->pos->parent : kit->start;
+ }
+
+ return kit->pos;
+}
+
+__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
+{
+}
+
+__diag_pop(); \ No newline at end of file
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4e3ce0542e31..08626b519ce2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -64,8 +64,8 @@
#define OFF insn->off
#define IMM insn->imm
-struct bpf_mem_alloc bpf_global_ma;
-bool bpf_global_ma_set;
+struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+bool bpf_global_ma_set, bpf_global_percpu_ma_set;
/* No hurry in this branch
*
@@ -212,7 +212,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
const struct bpf_line_info *linfo;
void **jited_linfo;
- if (!prog->aux->jited_linfo)
+ if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
/* Userspace did not provide linfo */
return;
@@ -539,7 +539,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
- for (i = 0; i < fp->aux->func_cnt; i++)
+ for (i = 0; i < fp->aux->real_func_cnt; i++)
bpf_prog_kallsyms_del(fp->aux->func[i]);
}
@@ -589,7 +589,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
/* prog->aux->name will be ignored if full btf name is available */
- if (prog->aux->func_info_cnt) {
+ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
type = btf_type_by_id(prog->aux->btf,
prog->aux->func_info[prog->aux->func_idx].type_id);
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -623,7 +623,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
if (val < ksym->start)
return -1;
- if (val >= ksym->end)
+ /* Ensure that we detect return addresses as part of the program, when
+ * the final instruction is a call for a program part of the stack
+ * trace. Therefore, do val > ksym->end instead of val >= ksym->end.
+ */
+ if (val > ksym->end)
return 1;
return 0;
@@ -733,7 +737,7 @@ bool is_bpf_text_address(unsigned long addr)
return ret;
}
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
struct bpf_ksym *ksym = bpf_ksym_find(addr);
@@ -1208,7 +1212,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
if (!extra_pass)
addr = NULL;
else if (prog->aux->func &&
- off >= 0 && off < prog->aux->func_cnt)
+ off >= 0 && off < prog->aux->real_func_cnt)
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
@@ -2721,7 +2725,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#endif
if (aux->dst_trampoline)
bpf_trampoline_put(aux->dst_trampoline);
- for (i = 0; i < aux->func_cnt; i++) {
+ for (i = 0; i < aux->real_func_cnt; i++) {
/* We can just unlink the subprog poke descriptor table as
* it was originally linked to the main program and is also
* released along with it.
@@ -2729,7 +2733,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
}
- if (aux->func_cnt) {
+ if (aux->real_func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
} else {
@@ -2914,6 +2918,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
return -ENOTSUPP;
}
+bool __weak bpf_jit_supports_exceptions(void)
+{
+ return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
@@ -2921,7 +2934,9 @@ static int __init bpf_global_ma_init(void)
ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
bpf_global_ma_set = !ret;
- return ret;
+ ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+ bpf_global_percpu_ma_set = !ret;
+ return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
}
late_initcall(bpf_global_ma_init);
#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e42a1bdb7f53..8a0bb80fe48a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -764,6 +764,16 @@ void __cpu_map_flush(void)
}
}
+#ifdef CONFIG_DEBUG_NET
+bool cpu_map_check_flush(void)
+{
+ if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
+ return false;
+ __cpu_map_flush();
+ return true;
+}
+#endif
+
static int __init cpu_map_init(void)
{
int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4d42f6ed6c11..a936c704d4e7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -418,6 +418,16 @@ void __dev_flush(void)
}
}
+#ifdef CONFIG_DEBUG_NET
+bool dev_check_flush(void)
+{
+ if (list_empty(this_cpu_ptr(&dev_flush_list)))
+ return false;
+ __dev_flush();
+ return true;
+}
+#endif
+
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
* by local_bh_disable() (from XDP calls inside NAPI). The
* rcu_read_lock_bh_held() below makes lockdep accept both.
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a8c7e1c5abfa..fd8d4b0addfc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -155,13 +155,15 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
preempt_disable();
+ local_irq_save(flags);
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
__this_cpu_dec(*(htab->map_locked[hash]));
+ local_irq_restore(flags);
preempt_enable();
return -EBUSY;
}
- raw_spin_lock_irqsave(&b->raw_lock, flags);
+ raw_spin_lock(&b->raw_lock);
*pflags = flags;
return 0;
@@ -172,8 +174,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
unsigned long flags)
{
hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+ raw_spin_unlock(&b->raw_lock);
__this_cpu_dec(*(htab->map_locked[hash]));
+ local_irq_restore(flags);
preempt_enable();
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8bd3812fb8df..e46ac288a108 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -22,6 +22,7 @@
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
+#include <linux/kasan.h>
#include "../../lib/kstrtox.h"
@@ -1271,7 +1272,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
if (in_nmi())
return -EOPNOTSUPP;
- if (flags > BPF_F_TIMER_ABS)
+ if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
@@ -1285,6 +1286,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
else
mode = HRTIMER_MODE_REL_SOFT;
+ if (flags & BPF_F_TIMER_CPU_PIN)
+ mode |= HRTIMER_MODE_PINNED;
+
hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
@@ -1807,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
{
@@ -1840,7 +1842,7 @@ unlock:
* bpf_list_head which needs to be freed.
*/
migrate_disable();
- __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -1879,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
migrate_disable();
- __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -1902,9 +1904,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
return p;
}
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ u64 size = local_type_id__k;
+
+ /* The verifier has ensured that meta__ign must be NULL */
+ return bpf_mem_alloc(&bpf_global_percpu_ma, size);
+}
+
/* Must be called under migrate_disable(), as required by bpf_mem_free */
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
{
+ struct bpf_mem_alloc *ma;
+
if (rec && rec->refcount_off >= 0 &&
!refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
/* Object is refcounted and refcount_dec didn't result in 0
@@ -1916,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
if (rec)
bpf_obj_free_fields(rec, p);
+ if (percpu)
+ ma = &bpf_global_percpu_ma;
+ else
+ ma = &bpf_global_ma;
if (rec && rec->refcount_off >= 0)
- bpf_mem_free_rcu(&bpf_global_ma, p);
+ bpf_mem_free_rcu(ma, p);
else
- bpf_mem_free(&bpf_global_ma, p);
+ bpf_mem_free(ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1927,7 +1943,13 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
- __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
+}
+
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ /* The verifier has ensured that meta__ign must be NULL */
+ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
}
__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
@@ -1965,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl((void *)n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
@@ -2064,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl((void *)n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
@@ -2197,7 +2219,12 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
struct cgroup *ancestor)
{
- return task_under_cgroup_hierarchy(task, ancestor);
+ long ret;
+
+ rcu_read_lock();
+ ret = task_under_cgroup_hierarchy(task, ancestor);
+ rcu_read_unlock();
+ return ret;
}
#endif /* CONFIG_CGROUPS */
@@ -2435,6 +2462,49 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
+struct bpf_throw_ctx {
+ struct bpf_prog_aux *aux;
+ u64 sp;
+ u64 bp;
+ int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct bpf_throw_ctx *ctx = cookie;
+ struct bpf_prog *prog;
+
+ if (!is_bpf_text_address(ip))
+ return !ctx->cnt;
+ prog = bpf_prog_ksym_find(ip);
+ ctx->cnt++;
+ if (bpf_is_subprog(prog))
+ return true;
+ ctx->aux = prog->aux;
+ ctx->sp = sp;
+ ctx->bp = bp;
+ return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+ struct bpf_throw_ctx ctx = {};
+
+ arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+ WARN_ON_ONCE(!ctx.aux);
+ if (ctx.aux)
+ WARN_ON_ONCE(!ctx.aux->exception_boundary);
+ WARN_ON_ONCE(!ctx.bp);
+ WARN_ON_ONCE(!ctx.cnt);
+ /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
+ * deeper stack depths than ctx.sp as we do not return from bpf_throw,
+ * which skips compiler generated instrumentation to do the same.
+ */
+ kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+ WARN(1, "A call to BPF exception callback should never return\n");
+}
+
__diag_pop();
BTF_SET8_START(generic_btf_ids)
@@ -2442,7 +2512,9 @@ BTF_SET8_START(generic_btf_ids)
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
@@ -2462,6 +2534,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
BTF_SET8_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -2488,6 +2561,18 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_dynptr_adjust)
BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index d93ddac283d4..63b909d277d4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -340,6 +340,7 @@ static void free_bulk(struct bpf_mem_cache *c)
int cnt;
WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
do {
inc_active(c, &flags);
@@ -365,6 +366,9 @@ static void __free_by_rcu(struct rcu_head *head)
struct bpf_mem_cache *tgt = c->tgt;
struct llist_node *llnode;
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
llnode = llist_del_all(&c->waiting_for_gp);
if (!llnode)
goto out;
@@ -491,21 +495,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
struct llist_node *first;
unsigned int obj_size;
- /* For per-cpu allocator, the size of free objects in free list doesn't
- * match with unit_size and now there is no way to get the size of
- * per-cpu pointer saved in free object, so just skip the checking.
- */
- if (c->percpu_size)
- return 0;
-
first = c->free_llist.first;
if (!first)
return 0;
- obj_size = ksize(first);
+ if (c->percpu_size)
+ obj_size = pcpu_alloc_size(((void **)first)[1]);
+ else
+ obj_size = ksize(first);
if (obj_size != c->unit_size) {
- WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
- idx, obj_size, c->unit_size);
+ WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n",
+ idx, c->percpu_size, obj_size, c->unit_size);
return -EINVAL;
}
return 0;
@@ -526,15 +526,17 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
+ /* room for llist_node and per-cpu pointer */
+ if (percpu)
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ ma->percpu = percpu;
+
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
if (!pc)
return -ENOMEM;
- if (percpu)
- /* room for llist_node and per-cpu pointer */
- percpu_size = LLIST_NODE_SZ + sizeof(void *);
- else
+ if (!percpu)
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
@@ -555,10 +557,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
return 0;
}
- /* size == 0 && percpu is an invalid combination */
- if (WARN_ON_ONCE(percpu))
- return -EINVAL;
-
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
@@ -572,6 +570,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
+ c->percpu_size = percpu_size;
c->tgt = c;
init_refill_work(c);
@@ -782,12 +781,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
}
}
local_dec(&c->active);
- local_irq_restore(flags);
WARN_ON(cnt < 0);
if (cnt < c->low_watermark)
irq_work_raise(c);
+ /* Enable IRQ after the enqueue of irq work completes, so irq work
+ * will run after IRQ is enabled and free_llist may be refilled by
+ * irq work before other task preempts current task.
+ */
+ local_irq_restore(flags);
+
return llnode;
}
@@ -823,11 +827,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra);
}
local_dec(&c->active);
- local_irq_restore(flags);
if (cnt > c->high_watermark)
/* free few objects from current cpu into global kmalloc pool */
irq_work_raise(c);
+ /* Enable IRQ after irq_work_raise() completes, otherwise when current
+ * task is preempted by task which does unit_alloc(), unit_alloc() may
+ * return NULL unexpectedly because irq work is already pending but can
+ * not been triggered and free_llist can not be refilled timely.
+ */
+ local_irq_restore(flags);
}
static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
@@ -845,10 +854,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra_rcu);
}
local_dec(&c->active);
- local_irq_restore(flags);
if (!atomic_read(&c->call_rcu_in_progress))
irq_work_raise(c);
+ local_irq_restore(flags);
}
/* Called from BPF program or from sys_bpf syscall.
@@ -870,6 +879,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
+{
+ size_t size;
+
+ if (percpu)
+ size = pcpu_alloc_size(*((void **)ptr));
+ else
+ size = ksize(ptr - LLIST_NODE_SZ);
+ return bpf_mem_cache_idx(size);
+}
+
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
int idx;
@@ -877,7 +897,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -891,7 +911,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -965,6 +985,12 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the
+ * actual size of dynamic per-cpu area will always be matched and there is
+ * no need to adjust size_index for per-cpu allocation. However for the
+ * simplicity of the implementation, use an unified size_index for both
+ * kmalloc and per-cpu allocation.
+ */
static __init int bpf_mem_cache_adjust_size(void)
{
unsigned int size;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 87d6693d8233..1a4fec330eaa 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -234,7 +234,14 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
attr->prog_type != BPF_PROG_TYPE_XDP)
return -EINVAL;
- if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+ if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS))
+ return -EINVAL;
+
+ /* Frags are allowed only if program is dev-bound-only, but not
+ * if it is requesting bpf offload.
+ */
+ if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS &&
+ !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY))
return -EINVAL;
if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
@@ -847,10 +854,11 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
if (!ops)
goto out;
- if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
- p = ops->xmo_rx_timestamp;
- else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
- p = ops->xmo_rx_hash;
+#define XDP_METADATA_KFUNC(name, _, __, xmo) \
+ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
+ XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
out:
up_read(&bpf_devs_lock);
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index f045fde632e5..0ee653a936ea 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -770,8 +770,7 @@ schedule_work_return:
/* Prevent the clearing of the busy-bit from being reordered before the
* storing of any rb consumer or producer positions.
*/
- smp_mb__before_atomic();
- atomic_set(&rb->busy, 0);
+ atomic_set_release(&rb->busy, 0);
if (flags & BPF_RB_FORCE_WAKEUP)
irq_work_queue(&rb->work);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 458bb80b14d5..d6b277482085 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -28,7 +28,7 @@ struct bpf_stack_map {
void *elems;
struct pcpu_freelist freelist;
u32 n_buckets;
- struct stack_map_bucket *buckets[];
+ struct stack_map_bucket *buckets[] __counted_by(n_buckets);
};
static inline bool stack_map_use_build_id(struct bpf_map *map)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d77b2f8b9364..0ed286b8a0f0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,8 +35,9 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
-#include <net/netfilter/nf_bpf_link.h>
+#include <net/netfilter/nf_bpf_link.h>
+#include <net/netkit.h>
#include <net/tcx.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -514,6 +515,7 @@ void btf_record_free(struct btf_record *rec)
switch (rec->fields[i].type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
btf_put(rec->fields[i].kptr.btf);
@@ -560,6 +562,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
switch (fields[i].type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
@@ -624,8 +627,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -650,6 +651,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
if (!xchgd_field)
break;
@@ -659,8 +661,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
field->kptr.btf_id);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
- pointee_struct_meta->record :
- NULL);
+ pointee_struct_meta->record : NULL,
+ fields[i].type == BPF_KPTR_PERCPU);
migrate_enable();
} else {
field->kptr.dtor(xchgd_field);
@@ -1045,6 +1047,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
case BPF_REFCOUNT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
@@ -2442,14 +2445,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return 0;
default:
return -EINVAL;
@@ -2745,7 +2753,7 @@ free_used_maps:
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
- __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
free_prog_sec:
free_uid(prog->aux->user);
@@ -3370,7 +3378,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
static int bpf_perf_link_fill_common(const struct perf_event *event,
char __user *uname, u32 ulen,
u64 *probe_offset, u64 *probe_addr,
- u32 *fd_type)
+ u32 *fd_type, unsigned long *missed)
{
const char *buf;
u32 prog_id;
@@ -3381,7 +3389,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
return -EINVAL;
err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
- probe_offset, probe_addr);
+ probe_offset, probe_addr, missed);
if (err)
return err;
if (!uname)
@@ -3404,6 +3412,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
struct bpf_link_info *info)
{
+ unsigned long missed;
char __user *uname;
u64 addr, offset;
u32 ulen, type;
@@ -3412,7 +3421,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
ulen = info->perf_event.kprobe.name_len;
err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
- &type);
+ &type, &missed);
if (err)
return err;
if (type == BPF_FD_TYPE_KRETPROBE)
@@ -3421,6 +3430,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
info->perf_event.type = BPF_PERF_EVENT_KPROBE;
info->perf_event.kprobe.offset = offset;
+ info->perf_event.kprobe.missed = missed;
if (!kallsyms_show_value(current_cred()))
addr = 0;
info->perf_event.kprobe.addr = addr;
@@ -3440,7 +3450,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
ulen = info->perf_event.uprobe.name_len;
err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
- &type);
+ &type, NULL);
if (err)
return err;
@@ -3476,7 +3486,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
ulen = info->perf_event.tracepoint.name_len;
info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
- return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
+ return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
}
static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
@@ -3672,14 +3682,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
case BPF_CGROUP_SOCK_OPS:
return BPF_PROG_TYPE_SOCK_OPS;
@@ -3716,6 +3731,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_LSM;
case BPF_TCX_INGRESS:
case BPF_TCX_EGRESS:
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
return BPF_PROG_TYPE_SCHED_CLS;
default:
return BPF_PROG_TYPE_UNSPEC;
@@ -3767,7 +3784,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
return 0;
case BPF_PROG_TYPE_SCHED_CLS:
if (attach_type != BPF_TCX_INGRESS &&
- attach_type != BPF_TCX_EGRESS)
+ attach_type != BPF_TCX_EGRESS &&
+ attach_type != BPF_NETKIT_PRIMARY &&
+ attach_type != BPF_NETKIT_PEER)
return -EINVAL;
return 0;
default:
@@ -3850,7 +3869,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
case BPF_PROG_TYPE_SCHED_CLS:
- ret = tcx_prog_attach(attr, prog);
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_attach(attr, prog);
+ else
+ ret = netkit_prog_attach(attr, prog);
break;
default:
ret = -EINVAL;
@@ -3911,7 +3934,11 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ret = cgroup_bpf_prog_detach(attr, ptype);
break;
case BPF_PROG_TYPE_SCHED_CLS:
- ret = tcx_prog_detach(attr, prog);
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_detach(attr, prog);
+ else
+ ret = netkit_prog_detach(attr, prog);
break;
default:
ret = -EINVAL;
@@ -3945,14 +3972,19 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
@@ -3973,6 +4005,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_TCX_INGRESS:
case BPF_TCX_EGRESS:
return tcx_prog_query(attr, uattr);
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return netkit_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -4818,7 +4853,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
&buf, &probe_offset,
- &probe_addr);
+ &probe_addr, NULL);
if (!err)
err = bpf_task_fd_query_copy(attr, uattr, prog_id,
fd_type, buf,
@@ -4954,7 +4989,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_xdp_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_SCHED_CLS:
- ret = tcx_link_attach(attr, prog);
+ if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
+ attr->link_create.attach_type == BPF_TCX_EGRESS)
+ ret = tcx_link_attach(attr, prog);
+ else
+ ret = netkit_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_NETFILTER:
ret = bpf_nf_link_attach(attr, prog);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 82ad23b1d257..654601dd6b49 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,7 +7,9 @@
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
+#include <linux/mm_types.h>
#include "mmap_unlock_work.h"
static const char * const iter_task_type_names[] = {
@@ -35,16 +37,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
u32 *tid,
bool skip_if_dup_files)
{
- struct task_struct *task, *next_task;
+ struct task_struct *task;
struct pid *pid;
- u32 saved_tid;
+ u32 next_tid;
if (!*tid) {
/* The first time, the iterator calls this function. */
pid = find_pid_ns(common->pid, common->ns);
- if (!pid)
- return NULL;
-
task = get_pid_task(pid, PIDTYPE_TGID);
if (!task)
return NULL;
@@ -66,44 +65,27 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return task;
}
- pid = find_pid_ns(common->pid_visiting, common->ns);
- if (!pid)
- return NULL;
-
- task = get_pid_task(pid, PIDTYPE_PID);
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
if (!task)
return NULL;
retry:
- if (!pid_alive(task)) {
- put_task_struct(task);
- return NULL;
- }
+ task = next_thread(task);
- next_task = next_thread(task);
- put_task_struct(task);
- if (!next_task)
- return NULL;
-
- saved_tid = *tid;
- *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
- if (!*tid || *tid == common->pid) {
+ next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
+ if (!next_tid || next_tid == common->pid) {
/* Run out of tasks of a process. The tasks of a
* thread_group are linked as circular linked list.
*/
- *tid = saved_tid;
return NULL;
}
- get_task_struct(next_task);
- common->pid_visiting = *tid;
-
- if (skip_if_dup_files && task->files == task->group_leader->files) {
- task = next_task;
+ if (skip_if_dup_files && task->files == task->group_leader->files)
goto retry;
- }
- return next_task;
+ *tid = common->pid_visiting = next_tid;
+ get_task_struct(task);
+ return task;
}
static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
@@ -821,6 +803,246 @@ const struct bpf_func_proto bpf_find_vma_proto = {
.arg5_type = ARG_ANYTHING,
};
+struct bpf_iter_task_vma_kern_data {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct mmap_unlock_irq_work *work;
+ struct vma_iterator vmi;
+};
+
+struct bpf_iter_task_vma {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+ struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+ struct task_struct *task, u64 addr)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+ bool irq_work_busy = false;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+ /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+ * before, so non-NULL kit->data doesn't point to previously
+ * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+ */
+ kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+ if (!kit->data)
+ return -ENOMEM;
+
+ kit->data->task = get_task_struct(task);
+ kit->data->mm = task->mm;
+ if (!kit->data->mm) {
+ err = -ENOENT;
+ goto err_cleanup_iter;
+ }
+
+ /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+ if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
+
+ vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ return 0;
+
+err_cleanup_iter:
+ if (kit->data->task)
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ /* NULL kit->data signals failed bpf_iter_task_vma initialization */
+ kit->data = NULL;
+ return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (!kit->data) /* bpf_iter_task_vma_new failed */
+ return NULL;
+ return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (kit->data) {
+ bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ }
+}
+
+__diag_pop();
+
+struct bpf_iter_css_task {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_task_kern {
+ struct css_task_iter *css_it;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+ struct cgroup_subsys_state *css, unsigned int flags)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
+ __alignof__(struct bpf_iter_css_task));
+ kit->css_it = NULL;
+ switch (flags) {
+ case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
+ case CSS_TASK_ITER_PROCS:
+ case 0:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
+ if (!kit->css_it)
+ return -ENOMEM;
+ css_task_iter_start(css, flags, kit->css_it);
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return NULL;
+ return css_task_iter_next(kit->css_it);
+}
+
+__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return;
+ css_task_iter_end(kit->css_it);
+ bpf_mem_free(&bpf_global_ma, kit->css_it);
+}
+
+__diag_pop();
+
+struct bpf_iter_task {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+ struct task_struct *task;
+ struct task_struct *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+enum {
+ /* all process in the system */
+ BPF_TASK_ITER_ALL_PROCS,
+ /* all threads in the system */
+ BPF_TASK_ITER_ALL_THREADS,
+ /* all threads of a specific process */
+ BPF_TASK_ITER_PROC_THREADS
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
+ struct task_struct *task__nullable, unsigned int flags)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+ __alignof__(struct bpf_iter_task));
+
+ kit->task = kit->pos = NULL;
+ switch (flags) {
+ case BPF_TASK_ITER_ALL_THREADS:
+ case BPF_TASK_ITER_ALL_PROCS:
+ break;
+ case BPF_TASK_ITER_PROC_THREADS:
+ if (!task__nullable)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (flags == BPF_TASK_ITER_PROC_THREADS)
+ kit->task = task__nullable;
+ else
+ kit->task = &init_task;
+ kit->pos = kit->task;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ struct task_struct *pos;
+ unsigned int flags;
+
+ flags = kit->flags;
+ pos = kit->pos;
+
+ if (!pos)
+ return pos;
+
+ if (flags == BPF_TASK_ITER_ALL_PROCS)
+ goto get_next_task;
+
+ kit->pos = next_thread(kit->pos);
+ if (kit->pos == kit->task) {
+ if (flags == BPF_TASK_ITER_PROC_THREADS) {
+ kit->pos = NULL;
+ return pos;
+ }
+ } else
+ return pos;
+
+get_next_task:
+ kit->pos = next_task(kit->pos);
+ kit->task = kit->pos;
+ if (kit->pos == &init_task)
+ kit->pos = NULL;
+
+ return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
+__diag_pop();
+
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 1338a13a8b64..2e4885e7781f 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -250,7 +250,7 @@ static void tcx_link_dealloc(struct bpf_link *link)
static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
{
- const struct tcx_link *tcx = tcx_link_const(link);
+ const struct tcx_link *tcx = tcx_link(link);
u32 ifindex = 0;
rtnl_lock();
@@ -267,7 +267,7 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
static int tcx_link_fill_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
- const struct tcx_link *tcx = tcx_link_const(link);
+ const struct tcx_link *tcx = tcx_link(link);
u32 ifindex = 0;
rtnl_lock();
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 53ff50cac61e..e97aeda3a86b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -415,8 +415,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
goto out;
}
- /* clear all bits except SHARE_IPMODIFY */
- tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+ /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 873ade146f3d..857d76694517 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta {
/* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
* generally to pass info about user-defined local kptr types to later
* verification logic
- * bpf_obj_drop
+ * bpf_obj_drop/bpf_percpu_obj_drop
* Record the local kptr type to be drop'd
* bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
* Record the local kptr type to be refcount_incr'd and use
@@ -543,6 +543,7 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
}
static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
static bool is_callback_calling_function(enum bpf_func_id func_id)
{
@@ -1172,7 +1173,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
struct bpf_reg_state *reg, int insn_idx,
struct btf *btf, u32 btf_id, int nr_slots)
{
@@ -1193,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ if (is_kfunc_rcu_protected(meta)) {
+ if (in_rcu_cs(env))
+ st->type |= MEM_RCU;
+ else
+ st->type |= PTR_UNTRUSTED;
+ }
st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = i == 0 ? id : 0;
st->iter.btf = btf;
@@ -1267,7 +1279,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
return true;
}
-static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
struct btf *btf, u32 btf_id, int nr_slots)
{
struct bpf_func_state *state = func(env, reg);
@@ -1275,26 +1287,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
spi = iter_get_spi(env, reg, nr_slots);
if (spi < 0)
- return false;
+ return -EINVAL;
for (i = 0; i < nr_slots; i++) {
struct bpf_stack_state *slot = &state->stack[spi - i];
struct bpf_reg_state *st = &slot->spilled_ptr;
+ if (st->type & PTR_UNTRUSTED)
+ return -EPROTO;
/* only main (first) slot has ref_obj_id set */
if (i == 0 && !st->ref_obj_id)
- return false;
+ return -EINVAL;
if (i != 0 && st->ref_obj_id)
- return false;
+ return -EINVAL;
if (st->iter.btf != btf || st->iter.btf_id != btf_id)
- return false;
+ return -EINVAL;
for (j = 0; j < BPF_REG_SIZE; j++)
if (slot->slot_type[j] != STACK_ITER)
- return false;
+ return -EINVAL;
}
- return true;
+ return 0;
}
/* Check if given stack slot is "special":
@@ -1341,6 +1355,50 @@ static void scrub_spilled_slot(u8 *stype)
*stype = STACK_MISC;
}
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ const char **sep)
+{
+ struct {
+ const char *name;
+ u64 val;
+ bool omit;
+ } minmaxs[] = {
+ {"smin", reg->smin_value, reg->smin_value == S64_MIN},
+ {"smax", reg->smax_value, reg->smax_value == S64_MAX},
+ {"umin", reg->umin_value, reg->umin_value == 0},
+ {"umax", reg->umax_value, reg->umax_value == U64_MAX},
+ {"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+ {"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+ {"umin32", reg->u32_min_value, reg->u32_min_value == 0},
+ {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX},
+ }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+ bool neg1, neg2;
+
+ for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+ if (m1->omit)
+ continue;
+
+ neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+ verbose(env, "%s%s=", *sep, m1->name);
+ *sep = ",";
+
+ for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+ if (m2->omit || m2->val != m1->val)
+ continue;
+ /* don't mix negatives with positives */
+ neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+ if (neg2 != neg1)
+ continue;
+ m2->omit = true;
+ verbose(env, "%s=", m2->name);
+ }
+
+ verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val);
+ }
+}
+
static void print_verifier_state(struct bpf_verifier_env *env,
const struct bpf_func_state *state,
bool print_all)
@@ -1404,34 +1462,13 @@ static void print_verifier_state(struct bpf_verifier_env *env,
*/
verbose_a("imm=%llx", reg->var_off.value);
} else {
- if (reg->smin_value != reg->umin_value &&
- reg->smin_value != S64_MIN)
- verbose_a("smin=%lld", (long long)reg->smin_value);
- if (reg->smax_value != reg->umax_value &&
- reg->smax_value != S64_MAX)
- verbose_a("smax=%lld", (long long)reg->smax_value);
- if (reg->umin_value != 0)
- verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
- if (reg->umax_value != U64_MAX)
- verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
+ print_scalar_ranges(env, reg, &sep);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose_a("var_off=%s", tn_buf);
}
- if (reg->s32_min_value != reg->smin_value &&
- reg->s32_min_value != S32_MIN)
- verbose_a("s32_min=%d", (int)(reg->s32_min_value));
- if (reg->s32_max_value != reg->smax_value &&
- reg->s32_max_value != S32_MAX)
- verbose_a("s32_max=%d", (int)(reg->s32_max_value));
- if (reg->u32_min_value != reg->umin_value &&
- reg->u32_min_value != U32_MIN)
- verbose_a("u32_min=%d", (int)(reg->u32_min_value));
- if (reg->u32_max_value != reg->umax_value &&
- reg->u32_max_value != U32_MAX)
- verbose_a("u32_max=%d", (int)(reg->u32_max_value));
}
#undef verbose_a
@@ -1515,7 +1552,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->in_async_callback_fn)
verbose(env, " async_cb");
verbose(env, "\n");
- mark_verifier_state_clean(env);
+ if (!print_all)
+ mark_verifier_state_clean(env);
}
static inline u32 vlog_alignment(u32 pos)
@@ -1748,7 +1786,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
- /* if dst has more stack frames then src frame, free them */
+ /* if dst has more stack frames then src frame, free them, this is also
+ * necessary in case of exceptional exits using bpf_throw.
+ */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
@@ -1762,6 +1802,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->dfs_depth = src->dfs_depth;
+ dst_state->used_as_loop_entry = src->used_as_loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -1777,11 +1819,203 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return 0;
}
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+ return env->prog->len;
+}
+
+static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = cur->frame[cur->curframe];
+
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
+
+static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
+{
+ int fr;
+
+ if (a->curframe != b->curframe)
+ return false;
+
+ for (fr = a->curframe; fr >= 0; fr--)
+ if (a->frame[fr]->callsite != b->frame[fr]->callsite)
+ return false;
+
+ return true;
+}
+
+/* Open coded iterators allow back-edges in the state graph in order to
+ * check unbounded loops that iterators.
+ *
+ * In is_state_visited() it is necessary to know if explored states are
+ * part of some loops in order to decide whether non-exact states
+ * comparison could be used:
+ * - non-exact states comparison establishes sub-state relation and uses
+ * read and precision marks to do so, these marks are propagated from
+ * children states and thus are not guaranteed to be final in a loop;
+ * - exact states comparison just checks if current and explored states
+ * are identical (and thus form a back-edge).
+ *
+ * Paper "A New Algorithm for Identifying Loops in Decompilation"
+ * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
+ * algorithm for loop structure detection and gives an overview of
+ * relevant terminology. It also has helpful illustrations.
+ *
+ * [1] https://api.semanticscholar.org/CorpusID:15784067
+ *
+ * We use a similar algorithm but because loop nested structure is
+ * irrelevant for verifier ours is significantly simpler and resembles
+ * strongly connected components algorithm from Sedgewick's textbook.
+ *
+ * Define topmost loop entry as a first node of the loop traversed in a
+ * depth first search starting from initial state. The goal of the loop
+ * tracking algorithm is to associate topmost loop entries with states
+ * derived from these entries.
+ *
+ * For each step in the DFS states traversal algorithm needs to identify
+ * the following situations:
+ *
+ * initial initial initial
+ * | | |
+ * V V V
+ * ... ... .---------> hdr
+ * | | | |
+ * V V | V
+ * cur .-> succ | .------...
+ * | | | | | |
+ * V | V | V V
+ * succ '-- cur | ... ...
+ * | | |
+ * | V V
+ * | succ <- cur
+ * | |
+ * | V
+ * | ...
+ * | |
+ * '----'
+ *
+ * (A) successor state of cur (B) successor state of cur or it's entry
+ * not yet traversed are in current DFS path, thus cur and succ
+ * are members of the same outermost loop
+ *
+ * initial initial
+ * | |
+ * V V
+ * ... ...
+ * | |
+ * V V
+ * .------... .------...
+ * | | | |
+ * V V V V
+ * .-> hdr ... ... ...
+ * | | | | |
+ * | V V V V
+ * | succ <- cur succ <- cur
+ * | | |
+ * | V V
+ * | ... ...
+ * | | |
+ * '----' exit
+ *
+ * (C) successor state of cur is a part of some loop but this loop
+ * does not include cur or successor state is not in a loop at all.
+ *
+ * Algorithm could be described as the following python code:
+ *
+ * traversed = set() # Set of traversed nodes
+ * entries = {} # Mapping from node to loop entry
+ * depths = {} # Depth level assigned to graph node
+ * path = set() # Current DFS path
+ *
+ * # Find outermost loop entry known for n
+ * def get_loop_entry(n):
+ * h = entries.get(n, None)
+ * while h in entries and entries[h] != h:
+ * h = entries[h]
+ * return h
+ *
+ * # Update n's loop entry if h's outermost entry comes
+ * # before n's outermost entry in current DFS path.
+ * def update_loop_entry(n, h):
+ * n1 = get_loop_entry(n) or n
+ * h1 = get_loop_entry(h) or h
+ * if h1 in path and depths[h1] <= depths[n1]:
+ * entries[n] = h1
+ *
+ * def dfs(n, depth):
+ * traversed.add(n)
+ * path.add(n)
+ * depths[n] = depth
+ * for succ in G.successors(n):
+ * if succ not in traversed:
+ * # Case A: explore succ and update cur's loop entry
+ * # only if succ's entry is in current DFS path.
+ * dfs(succ, depth + 1)
+ * h = get_loop_entry(succ)
+ * update_loop_entry(n, h)
+ * else:
+ * # Case B or C depending on `h1 in path` check in update_loop_entry().
+ * update_loop_entry(n, succ)
+ * path.remove(n)
+ *
+ * To adapt this algorithm for use with verifier:
+ * - use st->branch == 0 as a signal that DFS of succ had been finished
+ * and cur's loop entry has to be updated (case A), handle this in
+ * update_branch_counts();
+ * - use st->branch > 0 as a signal that st is in the current DFS path;
+ * - handle cases B and C in is_state_visited();
+ * - update topmost loop entry for intermediate states in get_loop_entry().
+ */
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+{
+ struct bpf_verifier_state *topmost = st->loop_entry, *old;
+
+ while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+ topmost = topmost->loop_entry;
+ /* Update loop entries for intermediate states to avoid this
+ * traversal in future get_loop_entry() calls.
+ */
+ while (st && st->loop_entry != topmost) {
+ old = st->loop_entry;
+ st->loop_entry = topmost;
+ st = old;
+ }
+ return topmost;
+}
+
+static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+{
+ struct bpf_verifier_state *cur1, *hdr1;
+
+ cur1 = get_loop_entry(cur) ?: cur;
+ hdr1 = get_loop_entry(hdr) ?: hdr;
+ /* The head1->branches check decides between cases B and C in
+ * comment for get_loop_entry(). If hdr1->branches == 0 then
+ * head's topmost loop entry is not in current DFS path,
+ * hence 'cur' and 'hdr' are not in the same loop and there is
+ * no need to update cur->loop_entry.
+ */
+ if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+ cur->loop_entry = hdr;
+ hdr->used_as_loop_entry = true;
+ }
+}
+
static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
while (st) {
u32 br = --st->branches;
+ /* br == 0 signals that DFS exploration for 'st' is finished,
+ * thus it is necessary to update parent's loop entry if it
+ * turned out that st is a part of some loop.
+ * This is a part of 'case A' in get_loop_entry() comment.
+ */
+ if (br == 0 && st->parent && st->loop_entry)
+ update_loop_entry(st->parent, st->loop_entry);
+
/* WARN_ON(br > 1) technically makes sense here,
* but see comment in push_stack(), hence:
*/
@@ -2454,6 +2688,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
return env->subprog_cnt - 1;
}
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct btf *btf = aux->btf;
+ const struct btf_type *t;
+ u32 main_btf_id, id;
+ const char *name;
+ int ret, i;
+
+ /* Non-zero func_info_cnt implies valid btf */
+ if (!aux->func_info_cnt)
+ return 0;
+ main_btf_id = aux->func_info[0].type_id;
+
+ t = btf_type_by_id(btf, main_btf_id);
+ if (!t) {
+ verbose(env, "invalid btf id for main subprog in func_info\n");
+ return -EINVAL;
+ }
+
+ name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ /* If there is no tag present, there is no exception callback */
+ if (ret == -ENOENT)
+ ret = 0;
+ else if (ret == -EEXIST)
+ verbose(env, "multiple exception callback tags for main subprog\n");
+ return ret;
+ }
+
+ ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+ if (ret < 0) {
+ verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+ return ret;
+ }
+ id = ret;
+ t = btf_type_by_id(btf, id);
+ if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+ verbose(env, "exception callback '%s' must have global linkage\n", name);
+ return -EINVAL;
+ }
+ ret = 0;
+ for (i = 0; i < aux->func_info_cnt; i++) {
+ if (aux->func_info[i].type_id != id)
+ continue;
+ ret = aux->func_info[i].insn_off;
+ /* Further func_info and subprog checks will also happen
+ * later, so assume this is the right insn_off for now.
+ */
+ if (!ret) {
+ verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+ ret = -EINVAL;
+ }
+ }
+ if (!ret) {
+ verbose(env, "exception callback type id not found in func_info\n");
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
#define MAX_KFUNC_DESCS 256
#define MAX_KFUNC_BTFS 256
@@ -2793,8 +3089,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
+ int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
struct bpf_insn *insn = env->prog->insnsi;
- int i, ret, insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
@@ -2820,6 +3116,26 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return ret;
}
+ ret = bpf_find_exception_callback_insn_off(env);
+ if (ret < 0)
+ return ret;
+ ex_cb_insn = ret;
+
+ /* If ex_cb_insn > 0, this means that the main program has a subprog
+ * marked using BTF decl tag to serve as the exception callback.
+ */
+ if (ex_cb_insn) {
+ ret = add_subprog(env, ex_cb_insn);
+ if (ret < 0)
+ return ret;
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start != ex_cb_insn)
+ continue;
+ env->exception_callback_subprog = i;
+ break;
+ }
+ }
+
/* Add a fake 'exit' subprog which could simplify subprog iteration
* logic. 'subprog_cnt' should not be increased.
*/
@@ -2868,7 +3184,7 @@ next:
if (i == subprog_end - 1) {
/* to avoid fall-through from one subprog into another
* the last insn of the subprog should be either exit
- * or unconditional jump back
+ * or unconditional jump back or bpf_throw call
*/
if (code != (BPF_JMP | BPF_EXIT) &&
code != (BPF_JMP32 | BPF_JA) &&
@@ -3029,7 +3345,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (class == BPF_LDX) {
if (t != SRC_OP)
- return BPF_SIZE(code) == BPF_DW;
+ return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
/* LDX source must be ptr. */
return true;
}
@@ -4999,6 +5315,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
perm_flags |= PTR_UNTRUSTED;
} else {
perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ perm_flags |= MEM_PERCPU;
}
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
@@ -5042,7 +5360,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
*/
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
kptr_field->kptr.btf, kptr_field->kptr.btf_id,
- kptr_field->type == BPF_KPTR_REF))
+ kptr_field->type != BPF_KPTR_UNREF))
goto bad_type;
return 0;
bad_type:
@@ -5086,7 +5404,18 @@ static bool rcu_safe_kptr(const struct btf_field *field)
{
const struct btf_field_kptr *kptr = &field->kptr;
- return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+ return field->type == BPF_KPTR_PERCPU ||
+ (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
+}
+
+static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
+{
+ if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
+ if (kptr_field->type != BPF_KPTR_PERCPU)
+ return PTR_MAYBE_NULL | MEM_RCU;
+ return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
+ }
+ return PTR_MAYBE_NULL | PTR_UNTRUSTED;
}
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
@@ -5112,7 +5441,8 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We only allow loading referenced kptr, since it will be marked as
* untrusted, similar to unreferenced kptr.
*/
- if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
+ if (class != BPF_LDX &&
+ (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
@@ -5123,10 +5453,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
- kptr_field->kptr.btf_id,
- rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
- PTR_MAYBE_NULL | MEM_RCU :
- PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
/* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
@@ -5180,6 +5507,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
switch (field->type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
if (src != ACCESS_DIRECT) {
verbose(env, "kptr cannot be accessed indirectly by helper\n");
return -EACCES;
@@ -5647,6 +5975,27 @@ continue_func:
for (; i < subprog_end; i++) {
int next_insn, sidx;
+ if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+ bool err = false;
+
+ if (!is_bpf_throw_kfunc(insn + i))
+ continue;
+ if (subprog[idx].is_cb)
+ err = true;
+ for (int c = 0; c < frame && !err; c++) {
+ if (subprog[ret_prog[c]].is_cb) {
+ err = true;
+ break;
+ }
+ }
+ if (!err)
+ continue;
+ verbose(env,
+ "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+ i, idx);
+ return -EINVAL;
+ }
+
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@@ -5669,6 +6018,10 @@ continue_func:
/* async callbacks don't increase bpf prog stack size unless called directly */
if (!bpf_pseudo_call(insn + i))
continue;
+ if (subprog[sidx].is_exception_cb) {
+ verbose(env, "insn %d cannot call exception cb directly\n", i);
+ return -EINVAL;
+ }
}
i = next_insn;
idx = sidx;
@@ -5690,8 +6043,13 @@ continue_func:
* tail call counter throughout bpf2bpf calls combined with tailcalls
*/
if (tail_call_reachable)
- for (j = 0; j < frame; j++)
+ for (j = 0; j < frame; j++) {
+ if (subprog[ret_prog[j]].is_exception_cb) {
+ verbose(env, "cannot tail call within exception cb\n");
+ return -EINVAL;
+ }
subprog[ret_prog[j]].tail_call_reachable = true;
+ }
if (subprog[0].tail_call_reachable)
env->prog->aux->tail_call_reachable = true;
@@ -6207,7 +6565,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
- !reg->ref_obj_id) {
+ !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
return -EFAULT;
}
@@ -7318,7 +7676,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
}
- if (kptr_field->type != BPF_KPTR_REF) {
+ if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
return -EACCES;
}
@@ -7489,15 +7847,24 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
return err;
}
- err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
if (err)
return err;
} else {
/* iter_next() or iter_destroy() expect initialized iter state*/
- if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+ switch (err) {
+ case 0:
+ break;
+ case -EINVAL:
verbose(env, "expected an initialized iter_%s as arg #%d\n",
iter_type_str(meta->btf, btf_id), regno);
- return -EINVAL;
+ return err;
+ case -EPROTO:
+ verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+ return err;
+ default:
+ return err;
}
spi = iter_get_spi(env, reg, nr_slots);
@@ -7523,6 +7890,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
return 0;
}
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur,
+ int insn_idx)
+{
+ struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *st;
+
+ /* Explored states are pushed in stack order, most recent states come first */
+ sl = *explored_state(env, insn_idx);
+ for (; sl; sl = sl->next) {
+ /* If st->branches != 0 state is a part of current DFS verification path,
+ * hence cur & st for a loop.
+ */
+ st = &sl->state;
+ if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+ st->dfs_depth < cur->dfs_depth)
+ return st;
+ }
+
+ return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ if (rold->type != SCALAR_VALUE)
+ return;
+ if (rold->type != rcur->type)
+ return;
+ if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+ return;
+ __mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr;
+
+ reset_idmap_scratch(env);
+ for (fr = old->curframe; fr >= 0; fr--) {
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ maybe_widen_reg(env,
+ &fold->regs[i],
+ &fcur->regs[i],
+ &env->idmap_scratch);
+
+ for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&fold->stack[i]) ||
+ !is_spilled_reg(&fcur->stack[i]))
+ continue;
+
+ maybe_widen_reg(env,
+ &fold->stack[i].spilled_ptr,
+ &fcur->stack[i].spilled_ptr,
+ &env->idmap_scratch);
+ }
+ }
+ return 0;
+}
+
/* process_iter_next_call() is called when verifier gets to iterator's next
* "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
* to it as just "iter_next()" in comments below.
@@ -7564,25 +8006,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
* is some statically known limit on number of iterations (e.g., if there is
* an explicit `if n > 100 then break;` statement somewhere in the loop).
*
- * One very subtle but very important aspect is that we *always* simulate NULL
- * condition first (as the current state) before we simulate non-NULL case.
- * This has to do with intricacies of scalar precision tracking. By simulating
- * "exit condition" of iter_next() returning NULL first, we make sure all the
- * relevant precision marks *that will be set **after** we exit iterator loop*
- * are propagated backwards to common parent state of NULL and non-NULL
- * branches. Thanks to that, state equivalence checks done later in forked
- * state, when reaching iter_next() for ACTIVE iterator, can assume that
- * precision marks are finalized and won't change. Because simulating another
- * ACTIVE iterator iteration won't change them (because given same input
- * states we'll end up with exactly same output states which we are currently
- * comparing; and verification after the loop already propagated back what
- * needs to be **additionally** tracked as precise). It's subtle, grok
- * precision tracking for more intuitive understanding.
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ * i = 0;
+ * while(iter_next(&it))
+ * i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior speculatively forget (widen) range for
+ * imprecise scalar registers, if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows to verify wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on a second.
+ * For example, the following safe program would fail to verify:
+ *
+ * struct bpf_num_iter it;
+ * int arr[10];
+ * int i = 0, a = 0;
+ * bpf_iter_num_new(&it, 0, 10);
+ * while (bpf_iter_num_next(&it)) {
+ * if (a == 0) {
+ * a = 1;
+ * i = 7; // Because i changed verifier would forget
+ * // it's range on second loop entry.
+ * } else {
+ * arr[i] = 42; // This would fail to verify.
+ * }
+ * }
+ * bpf_iter_num_destroy(&it);
*/
static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
struct bpf_reg_state *cur_iter, *queued_iter;
int iter_frameno = meta->iter.frameno;
@@ -7600,6 +8064,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
}
if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* Because iter_next() call is a checkpoint is_state_visitied()
+ * should guarantee parent state with same call sites and insn_idx.
+ */
+ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+ !same_callsites(cur_st->parent, cur_st)) {
+ verbose(env, "bug: bad parent state for iter next call");
+ return -EFAULT;
+ }
+ /* Note cur_st->parent in the call below, it is necessary to skip
+ * checkpoint created for cur_st by is_state_visited()
+ * right at this instruction.
+ */
+ prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
/* branch out active iter state */
queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
if (!queued_st)
@@ -7608,6 +8085,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
queued_iter->iter.depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
queued_fr = queued_st->frame[queued_st->curframe];
mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
@@ -7751,6 +8230,7 @@ static const struct bpf_reg_types btf_ptr_types = {
static const struct bpf_reg_types percpu_btf_ptr_types = {
.types = {
PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
}
};
@@ -7829,8 +8309,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (base_type(arg_type) == ARG_PTR_TO_MEM)
type &= ~DYNPTR_TYPE_FLAG_MASK;
- if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type))
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
type &= ~MEM_ALLOC;
+ type &= ~MEM_PERCPU;
+ }
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
@@ -7913,6 +8395,7 @@ found:
break;
}
case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
meta->func_id != BPF_FUNC_kptr_xchg) {
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
@@ -7924,6 +8407,7 @@ found:
}
break;
case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
/* Handled by helper specific checks */
break;
@@ -8900,6 +9384,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
* callbacks
*/
if (set_callee_state_cb != set_callee_state) {
+ env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_kfunc(insn->imm)) {
verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
@@ -9289,7 +9774,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
verbose(env, "to caller at %d:\n", *insn_idx);
print_verifier_state(env, caller, true);
}
- /* clear everything in the callee */
+ /* clear everything in the callee. In case of exceptional exits using
+ * bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
state->frame[state->curframe--] = NULL;
return 0;
@@ -9413,17 +9899,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
}
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
struct bpf_func_state *state = cur_func(env);
bool refs_lingering = false;
int i;
- if (state->frameno && !state->in_callback_fn)
+ if (!exception_exit && state->frameno && !state->in_callback_fn)
return 0;
for (i = 0; i < state->acquired_refs; i++) {
- if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
@@ -9530,6 +10016,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
int *insn_idx_p)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ bool returns_cpu_specific_alloc_ptr = false;
const struct bpf_func_proto *fn = NULL;
enum bpf_return_type ret_type;
enum bpf_type_flag ret_flag;
@@ -9640,6 +10127,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
+ u32 ref_obj_id = meta.ref_obj_id;
+ bool in_rcu = in_rcu_cs(env);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ err = release_reference_state(cur_func(env), ref_obj_id);
+ if (!err) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+ reg->ref_obj_id = 0;
+ reg->type &= ~MEM_ALLOC;
+ reg->type |= MEM_RCU;
+ } else {
+ mark_reg_invalid(env, reg);
+ }
+ }
+ }));
+ }
} else if (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
} else if (register_is_null(&regs[meta.release_regno])) {
@@ -9657,7 +10164,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
switch (func_id) {
case BPF_FUNC_tail_call:
- err = check_reference_leak(env);
+ err = check_reference_leak(env, false);
if (err) {
verbose(env, "tail_call would lead to reference leak\n");
return err;
@@ -9768,6 +10275,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
break;
}
+ case BPF_FUNC_per_cpu_ptr:
+ case BPF_FUNC_this_cpu_ptr:
+ {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1];
+ const struct btf_type *type;
+
+ if (reg->type & MEM_RCU) {
+ type = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!type || !btf_type_is_struct(type)) {
+ verbose(env, "Helper has invalid btf/btf_id in R1\n");
+ return -EFAULT;
+ }
+ returns_cpu_specific_alloc_ptr = true;
+ env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+ }
+ break;
+ }
case BPF_FUNC_user_ringbuf_drain:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_user_ringbuf_callback_state);
@@ -9857,14 +10381,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else {
- /* MEM_RDONLY may be carried from ret_flag, but it
- * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
- * it will confuse the check of PTR_TO_BTF_ID in
- * check_mem_access().
- */
- ret_flag &= ~MEM_RDONLY;
+ if (returns_cpu_specific_alloc_ptr) {
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+ } else {
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ }
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
@@ -9880,8 +10408,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_kptr_xchg) {
ret_btf = meta.kptr_field->kptr.btf;
ret_btf_id = meta.kptr_field->kptr.btf_id;
- if (!btf_is_kernel(ret_btf))
+ if (!btf_is_kernel(ret_btf)) {
regs[BPF_REG_0].type |= MEM_ALLOC;
+ if (meta.kptr_field->type == BPF_KPTR_PERCPU)
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+ }
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
@@ -10028,6 +10559,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RCU;
}
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
static bool __kfunc_param_match_suffix(const struct btf *btf,
const struct btf_param *arg,
const char *suffix)
@@ -10102,6 +10638,11 @@ static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf
return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
}
+static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__nullable");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -10244,6 +10785,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_CALLBACK,
KF_ARG_PTR_TO_RB_ROOT,
KF_ARG_PTR_TO_RB_NODE,
+ KF_ARG_PTR_TO_NULL,
};
enum special_kfunc_type {
@@ -10266,6 +10808,10 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
+ KF_bpf_percpu_obj_new_impl,
+ KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_throw,
+ KF_bpf_iter_css_task_new,
};
BTF_SET_START(special_kfunc_set)
@@ -10286,6 +10832,10 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_iter_css_task_new)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -10308,6 +10858,10 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_iter_css_task_new)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10388,6 +10942,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
if (argno + 1 < nargs &&
(is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
@@ -10625,6 +11181,12 @@ static bool is_callback_calling_kfunc(u32 btf_id)
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+ return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
return is_bpf_rbtree_api_kfunc(btf_id);
@@ -10832,6 +11394,20 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
&meta->arg_rbtree_root.field);
}
+static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ return true;
+ case BPF_TRACE_ITER:
+ return env->prog->aux->sleepable;
+ default:
+ return false;
+ }
+}
+
static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
int insn_idx)
{
@@ -10918,7 +11494,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
- (register_is_null(reg) || type_may_be_null(reg->type))) {
+ (register_is_null(reg) || type_may_be_null(reg->type)) &&
+ !is_kfunc_arg_nullable(meta->btf, &args[i])) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
}
@@ -10943,6 +11520,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return kf_arg_type;
switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_NULL:
+ continue;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@@ -11002,7 +11581,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
break;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
- if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else {
verbose(env, "arg#%d expected pointer to allocated object\n", i);
return -EINVAL;
}
@@ -11010,8 +11599,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
- if (meta->btf == btf_vmlinux &&
- meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ if (meta->btf == btf_vmlinux) {
meta->arg_btf = reg->btf;
meta->arg_btf_id = reg->btf_id;
}
@@ -11073,6 +11661,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
}
case KF_ARG_PTR_TO_ITER:
+ if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
+ if (!check_css_task_iter_allowlist(env)) {
+ verbose(env, "css_task_iter is only allowed in bpf_lsm and bpf iter-s\n");
+ return -EINVAL;
+ }
+ }
ret = process_iter_arg(env, regno, insn_idx, meta);
if (ret < 0)
return ret;
@@ -11202,6 +11796,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
}
case KF_ARG_PTR_TO_CALLBACK:
+ if (reg->type != PTR_TO_FUNC) {
+ verbose(env, "arg%d expected pointer to func\n", i);
+ return -EINVAL;
+ }
meta->subprogno = reg->subprogno;
break;
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
@@ -11280,6 +11878,8 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
+static int check_return_code(struct bpf_verifier_env *env, int regno);
+
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -11326,6 +11926,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (env->cur_state->active_rcu_lock) {
struct bpf_func_state *state;
struct bpf_reg_state *reg;
+ u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
@@ -11336,7 +11937,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
return -EINVAL;
} else if (rcu_unlock) {
- bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
if (reg->type & MEM_RCU) {
reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
reg->type |= PTR_UNTRUSTED;
@@ -11401,6 +12002,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+ if (!bpf_jit_supports_exceptions()) {
+ verbose(env, "JIT does not support calling kfunc %s#%d\n",
+ func_name, meta.func_id);
+ return -ENOTSUPP;
+ }
+ env->seen_exception = true;
+
+ /* In the case of the default callback, the cookie value passed
+ * to bpf_throw becomes the return value of the program.
+ */
+ if (!env->exception_callback_subprog) {
+ err = check_return_code(env, BPF_REG_1);
+ if (err < 0)
+ return err;
+ }
+ }
+
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -11411,6 +12030,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Only exception is bpf_obj_new_impl */
if (meta.btf != btf_vmlinux ||
(meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
@@ -11424,11 +12044,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
struct btf *ret_btf;
u32 ret_btf_id;
- if (unlikely(!bpf_global_ma_set))
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ return -ENOMEM;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
return -ENOMEM;
if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
@@ -11441,24 +12066,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* This may be NULL due to user not supplying a BTF */
if (!ret_btf) {
- verbose(env, "bpf_obj_new requires prog BTF\n");
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
return -EINVAL;
}
ret_t = btf_type_by_id(ret_btf, ret_btf_id);
if (!ret_t || !__btf_type_is_struct(ret_t)) {
- verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
return -EINVAL;
}
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
insn_aux->obj_new_size = ret_t->size;
- insn_aux->kptr_struct_meta =
- btf_find_struct_meta(ret_btf, ret_btf_id);
+ insn_aux->kptr_struct_meta = struct_meta;
} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
@@ -11595,7 +12234,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].id = ++env->id_gen;
} else if (btf_type_is_void(t)) {
if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
insn_aux->kptr_struct_meta =
btf_find_struct_meta(meta.arg_btf,
meta.arg_btf_id);
@@ -13391,12 +14031,16 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
return !!tnum_equals_const(subreg, val);
else if (val < reg->u32_min_value || val > reg->u32_max_value)
return 0;
+ else if (sval < reg->s32_min_value || sval > reg->s32_max_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(subreg))
return !tnum_equals_const(subreg, val);
else if (val < reg->u32_min_value || val > reg->u32_max_value)
return 1;
+ else if (sval < reg->s32_min_value || sval > reg->s32_max_value)
+ return 1;
break;
case BPF_JSET:
if ((~subreg.mask & subreg.value) & val)
@@ -13468,12 +14112,16 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return !!tnum_equals_const(reg->var_off, val);
else if (val < reg->umin_value || val > reg->umax_value)
return 0;
+ else if (sval < reg->smin_value || sval > reg->smax_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(reg->var_off))
return !tnum_equals_const(reg->var_off, val);
else if (val < reg->umin_value || val > reg->umax_value)
return 1;
+ else if (sval < reg->smin_value || sval > reg->smax_value)
+ return 1;
break;
case BPF_JSET:
if ((~reg->var_off.mask & reg->var_off.value) & val)
@@ -14135,6 +14783,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
!sanitize_speculative_path(env, insn, *insn_idx + 1,
*insn_idx))
return -EFAULT;
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
@@ -14147,6 +14797,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx + insn->off + 1,
*insn_idx))
return -EFAULT;
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -14425,7 +15077,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env);
+ err = check_reference_leak(env, false);
if (err) {
verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
return err;
@@ -14474,7 +15126,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env)
+static int check_return_code(struct bpf_verifier_env *env, int regno)
{
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
@@ -14486,7 +15138,7 @@ static int check_return_code(struct bpf_verifier_env *env)
const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog) {
+ if (!is_subprog || frame->in_exception_callback_fn) {
switch (prog_type) {
case BPF_PROG_TYPE_LSM:
if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -14508,22 +15160,22 @@ static int check_return_code(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, regno, SRC_OP);
if (err)
return err;
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr as return value\n", regno);
return -EACCES;
}
- reg = cur_regs(env) + BPF_REG_0;
+ reg = cur_regs(env) + regno;
if (frame->in_async_callback_fn) {
/* enforce return zero from async callbacks like timer */
if (reg->type != SCALAR_VALUE) {
- verbose(env, "In async callback the register R0 is not a known value (%s)\n",
- reg_type_str(env, reg->type));
+ verbose(env, "In async callback the register R%d is not a known value (%s)\n",
+ regno, reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -14534,10 +15186,10 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
}
- if (is_subprog) {
+ if (is_subprog && !frame->in_exception_callback_fn) {
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
- reg_type_str(env, reg->type));
+ verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
+ regno, reg_type_str(env, reg->type));
return -EINVAL;
}
return 0;
@@ -14547,10 +15199,13 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
range = tnum_range(1, 1);
if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
@@ -14619,8 +15274,8 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str(env, reg->type));
+ verbose(env, "At program exit the register R%d is not a known value (%s)\n",
+ regno, reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -14679,21 +15334,6 @@ enum {
BRANCH = 2,
};
-static u32 state_htab_size(struct bpf_verifier_env *env)
-{
- return env->prog->len;
-}
-
-static struct bpf_verifier_state_list **explored_state(
- struct bpf_verifier_env *env,
- int idx)
-{
- struct bpf_verifier_state *cur = env->cur_state;
- struct bpf_func_state *state = cur->frame[cur->curframe];
-
- return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
-}
-
static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].prune_point = true;
@@ -14891,8 +15531,8 @@ static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
- int ret = 0;
- int i;
+ int ex_insn_beg, i, ret = 0;
+ bool ex_done = false;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
@@ -14908,6 +15548,7 @@ static int check_cfg(struct bpf_verifier_env *env)
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
+walk_cfg:
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
@@ -14934,6 +15575,16 @@ static int check_cfg(struct bpf_verifier_env *env)
goto err_free;
}
+ if (env->exception_callback_subprog && !ex_done) {
+ ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;
+
+ insn_state[ex_insn_beg] = DISCOVERED;
+ insn_stack[0] = ex_insn_beg;
+ env->cfg.cur_stack = 1;
+ ex_done = true;
+ goto walk_cfg;
+ }
+
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
@@ -14971,20 +15622,18 @@ static int check_abnormal_return(struct bpf_verifier_env *env)
#define MIN_BPF_FUNCINFO_SIZE 8
#define MAX_FUNCINFO_REC_SIZE 252
-static int check_btf_func(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
+static int check_btf_func_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
- const struct btf_type *type, *func_proto, *ret_type;
- u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
+ const struct btf_type *type, *func_proto;
+ u32 i, nfuncs, urec_size, min_size;
struct bpf_func_info *krecord;
- struct bpf_func_info_aux *info_aux = NULL;
struct bpf_prog *prog;
const struct btf *btf;
- bpfptr_t urecord;
u32 prev_offset = 0;
- bool scalar_return;
+ bpfptr_t urecord;
int ret = -ENOMEM;
nfuncs = attr->func_info_cnt;
@@ -14994,11 +15643,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
return 0;
}
- if (nfuncs != env->subprog_cnt) {
- verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
- return -EINVAL;
- }
-
urec_size = attr->func_info_rec_size;
if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
urec_size > MAX_FUNCINFO_REC_SIZE ||
@@ -15016,9 +15660,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
if (!krecord)
return -ENOMEM;
- info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
- if (!info_aux)
- goto err_free;
for (i = 0; i < nfuncs; i++) {
ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -15057,11 +15698,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
}
- if (env->subprog_info[i].start != krecord[i].insn_off) {
- verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
- goto err_free;
- }
-
/* check type_id */
type = btf_type_by_id(btf, krecord[i].type_id);
if (!type || !btf_type_is_func(type)) {
@@ -15069,12 +15705,77 @@ static int check_btf_func(struct bpf_verifier_env *env,
krecord[i].type_id);
goto err_free;
}
- info_aux[i].linkage = BTF_INFO_VLEN(type->info);
func_proto = btf_type_by_id(btf, type->type);
if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
/* btf_func_check() already verified it during BTF load */
goto err_free;
+
+ prev_offset = krecord[i].insn_off;
+ bpfptr_add(&urecord, urec_size);
+ }
+
+ prog->aux->func_info = krecord;
+ prog->aux->func_info_cnt = nfuncs;
+ return 0;
+
+err_free:
+ kvfree(krecord);
+ return ret;
+}
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ const struct btf_type *type, *func_proto, *ret_type;
+ u32 i, nfuncs, urec_size;
+ struct bpf_func_info *krecord;
+ struct bpf_func_info_aux *info_aux = NULL;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t urecord;
+ bool scalar_return;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
+ }
+
+ urec_size = attr->func_info_rec_size;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+
+ krecord = prog->aux->func_info;
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
+ if (!info_aux)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ /* check insn_off */
+ ret = -EINVAL;
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+ goto err_free;
+ }
+
+ /* Already checked type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+ /* Already checked func_proto */
+ func_proto = btf_type_by_id(btf, type->type);
+
ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
scalar_return =
btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
@@ -15087,17 +15788,13 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
}
- prev_offset = krecord[i].insn_off;
bpfptr_add(&urecord, urec_size);
}
- prog->aux->func_info = krecord;
- prog->aux->func_info_cnt = nfuncs;
prog->aux->func_info_aux = info_aux;
return 0;
err_free:
- kvfree(krecord);
kfree(info_aux);
return ret;
}
@@ -15110,7 +15807,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
if (!aux->func_info)
return;
- for (i = 0; i < env->subprog_cnt; i++)
+ /* func_info is not available for hidden subprogs */
+ for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
@@ -15314,9 +16012,9 @@ static int check_core_relo(struct bpf_verifier_env *env,
return err;
}
-static int check_btf_info(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
+static int check_btf_info_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
struct btf *btf;
int err;
@@ -15336,6 +16034,24 @@ static int check_btf_info(struct bpf_verifier_env *env,
}
env->prog->aux->btf = btf;
+ err = check_btf_func_early(env, attr, uattr);
+ if (err)
+ return err;
+ return 0;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
err = check_btf_func(env, attr, uattr);
if (err)
return err;
@@ -15494,18 +16210,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
struct bpf_verifier_state_list *sl;
- int i;
sl = *explored_state(env, insn);
while (sl) {
if (sl->state.branches)
goto next;
if (sl->state.insn_idx != insn ||
- sl->state.curframe != cur->curframe)
+ !same_callsites(&sl->state, cur))
goto next;
- for (i = 0; i <= cur->curframe; i++)
- if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
- goto next;
clean_verifier_state(env, &sl->state);
next:
sl = sl->next;
@@ -15523,8 +16235,11 @@ static bool regs_exact(const struct bpf_reg_state *rold,
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
{
+ if (exact)
+ return regs_exact(rold, rcur, idmap);
+
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
return true;
@@ -15641,7 +16356,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_idmap *idmap)
+ struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
{
int i, spi;
@@ -15654,7 +16369,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+ if (exact &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ return false;
+
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
i += BPF_REG_SIZE - 1;
/* explored state didn't use this */
continue;
@@ -15704,7 +16424,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* return false to continue verification of this path
*/
if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap))
+ &cur->stack[spi].spilled_ptr, idmap, exact))
return false;
break;
case STACK_DYNPTR:
@@ -15786,16 +16506,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
* the current state will reach 'bpf_exit' instruction safely
*/
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur)
+ struct bpf_func_state *cur, bool exact)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
- &env->idmap_scratch))
+ &env->idmap_scratch, exact))
return false;
- if (!stacksafe(env, old, cur, &env->idmap_scratch))
+ if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
return false;
if (!refsafe(old, cur, &env->idmap_scratch))
@@ -15804,17 +16524,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
return true;
}
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+ env->idmap_scratch.tmp_id_gen = env->id_gen;
+ memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+}
+
static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur)
+ struct bpf_verifier_state *cur,
+ bool exact)
{
int i;
if (old->curframe != cur->curframe)
return false;
- env->idmap_scratch.tmp_id_gen = env->id_gen;
- memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+ reset_idmap_scratch(env);
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
@@ -15844,7 +16570,7 @@ static bool states_equal(struct bpf_verifier_env *env,
for (i = 0; i <= old->curframe; i++) {
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(env, old->frame[i], cur->frame[i]))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
return false;
}
return true;
@@ -16098,10 +16824,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl, **pprev;
- struct bpf_verifier_state *cur = env->cur_state, *new;
- int i, j, err, states_cnt = 0;
+ struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
+ int i, j, n, err, states_cnt = 0;
bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
bool add_new_state = force_new_state;
+ bool force_exact;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -16154,9 +16881,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* It's safe to assume that iterator loop will finish, taking into
* account iter_next() contract of eventually returning
* sticky NULL result.
+ *
+ * Note, that states have to be compared exactly in this case because
+ * read and precision marks might not be finalized inside the loop.
+ * E.g. as in the program below:
+ *
+ * 1. r7 = -16
+ * 2. r6 = bpf_get_prandom_u32()
+ * 3. while (bpf_iter_num_next(&fp[-8])) {
+ * 4. if (r6 != 42) {
+ * 5. r7 = -32
+ * 6. r6 = bpf_get_prandom_u32()
+ * 7. continue
+ * 8. }
+ * 9. r0 = r10
+ * 10. r0 += r7
+ * 11. r8 = *(u64 *)(r0 + 0)
+ * 12. r6 = bpf_get_prandom_u32()
+ * 13. }
+ *
+ * Here verifier would first visit path 1-3, create a checkpoint at 3
+ * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+ * not have read or precision mark for r7 yet, thus inexact states
+ * comparison would discard current state with r7=-32
+ * => unsafe memory access at 11 would not be caught.
*/
if (is_iter_next_insn(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur)) {
+ if (states_equal(env, &sl->state, cur, true)) {
struct bpf_func_state *cur_frame;
struct bpf_reg_state *iter_state, *iter_reg;
int spi;
@@ -16172,17 +16923,23 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
*/
spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
- if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+ update_loop_entry(cur, &sl->state);
goto hit;
+ }
}
goto skip_inf_loop_check;
}
/* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur) &&
+ states_equal(env, &sl->state, cur, false) &&
!iter_active_depths_differ(&sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ verbose(env, "cur state:");
+ print_verifier_state(env, cur->frame[cur->curframe], true);
+ verbose(env, "old state:");
+ print_verifier_state(env, sl->state.frame[cur->curframe], true);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
@@ -16204,7 +16961,36 @@ skip_inf_loop_check:
add_new_state = false;
goto miss;
}
- if (states_equal(env, &sl->state, cur)) {
+ /* If sl->state is a part of a loop and this loop's entry is a part of
+ * current verification path then states have to be compared exactly.
+ * 'force_exact' is needed to catch the following case:
+ *
+ * initial Here state 'succ' was processed first,
+ * | it was eventually tracked to produce a
+ * V state identical to 'hdr'.
+ * .---------> hdr All branches from 'succ' had been explored
+ * | | and thus 'succ' has its .branches == 0.
+ * | V
+ * | .------... Suppose states 'cur' and 'succ' correspond
+ * | | | to the same instruction + callsites.
+ * | V V In such case it is necessary to check
+ * | ... ... if 'succ' and 'cur' are states_equal().
+ * | | | If 'succ' and 'cur' are a part of the
+ * | V V same loop exact flag has to be set.
+ * | succ <- cur To check if that is the case, verify
+ * | | if loop entry of 'succ' is in current
+ * | V DFS path.
+ * | ...
+ * | |
+ * '----'
+ *
+ * Additional details are in the comment before get_loop_entry().
+ */
+ loop_entry = get_loop_entry(&sl->state);
+ force_exact = loop_entry && loop_entry->branches > 0;
+ if (states_equal(env, &sl->state, cur, force_exact)) {
+ if (force_exact)
+ update_loop_entry(cur, loop_entry);
hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
@@ -16243,13 +17029,18 @@ miss:
* to keep checking from state equivalence point of view.
* Higher numbers increase max_states_per_insn and verification time,
* but do not meaningfully decrease insn_processed.
+ * 'n' controls how many times state could miss before eviction.
+ * Use bigger 'n' for checkpoints because evicting checkpoint states
+ * too early would hinder iterator convergence.
*/
- if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+ n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+ if (sl->miss_cnt > sl->hit_cnt * n + n) {
/* the state is unlikely to be useful. Remove it to
* speed up verification
*/
*pprev = sl->next;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
+ !sl->state.used_as_loop_entry) {
u32 br = sl->state.branches;
WARN_ONCE(br,
@@ -16318,6 +17109,7 @@ next:
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->dfs_depth = new->dfs_depth + 1;
clear_jmp_history(cur);
new_sl->next = *explored_state(env, insn_idx);
*explored_state(env, insn_idx) = new_sl;
@@ -16438,6 +17230,7 @@ static int do_check(struct bpf_verifier_env *env)
int prev_insn_idx = -1;
for (;;) {
+ bool exception_exit = false;
struct bpf_insn *insn;
u8 class;
int err;
@@ -16652,12 +17445,17 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
}
- if (insn->src_reg == BPF_PSEUDO_CALL)
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
err = check_func_call(env, insn, &env->insn_idx);
- else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
err = check_kfunc_call(env, insn, &env->insn_idx);
- else
+ if (!err && is_bpf_throw_kfunc(insn)) {
+ exception_exit = true;
+ goto process_bpf_exit_full;
+ }
+ } else {
err = check_helper_call(env, insn, &env->insn_idx);
+ }
if (err)
return err;
@@ -16687,7 +17485,7 @@ static int do_check(struct bpf_verifier_env *env)
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
-
+process_bpf_exit_full:
if (env->cur_state->active_lock.ptr &&
!in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_spin_unlock is missing\n");
@@ -16706,10 +17504,23 @@ static int do_check(struct bpf_verifier_env *env)
* function, for which reference_state must
* match caller reference state when it exits.
*/
- err = check_reference_leak(env);
+ err = check_reference_leak(env, exception_exit);
if (err)
return err;
+ /* The side effect of the prepare_func_exit
+ * which is being skipped is that it frees
+ * bpf_func_state. Typically, process_bpf_exit
+ * will only be hit with outermost exit.
+ * copy_verifier_state in pop_stack will handle
+ * freeing of any extra bpf_func_state left over
+ * from not processing all nested function
+ * exits. We also skip return code checks as
+ * they are not needed for exceptional exits.
+ */
+ if (exception_exit)
+ goto process_bpf_exit;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -16719,7 +17530,7 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_return_code(env);
+ err = check_return_code(env, BPF_REG_0);
if (err)
return err;
process_bpf_exit:
@@ -18012,6 +18823,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ if (!i)
+ func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -18051,7 +18865,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* the call instruction, as an index for this list
*/
func[i]->aux->func = func;
- func[i]->aux->func_cnt = env->subprog_cnt;
+ func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ func[i]->aux->real_func_cnt = env->subprog_cnt;
}
for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
@@ -18097,7 +18912,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->aux->extable = func[0]->aux->extable;
prog->aux->num_exentries = func[0]->aux->num_exentries;
prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt;
+ prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ prog->aux->real_func_cnt = env->subprog_cnt;
+ prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+ prog->aux->exception_boundary = func[0]->aux->exception_boundary;
bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
@@ -18264,21 +19082,35 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn->imm = BPF_CALL_IMM(desc->addr);
if (insn->off)
return 0;
- if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
+ verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
insn_buf[1] = addr[0];
insn_buf[2] = addr[1];
insn_buf[3] = *insn;
*cnt = 4;
} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
+ verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
!kptr_struct_meta) {
verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
@@ -18319,6 +19151,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+ struct bpf_subprog_info *info = env->subprog_info;
+ int cnt = env->subprog_cnt;
+ struct bpf_prog *prog;
+
+ /* We only reserve one slot for hidden subprogs in subprog_info. */
+ if (env->hidden_subprog_cnt) {
+ verbose(env, "verifier internal error: only one hidden subprog supported\n");
+ return -EFAULT;
+ }
+ /* We're not patching any existing instruction, just appending the new
+ * ones for the hidden subprog. Hence all of the adjustment operations
+ * in bpf_patch_insn_data are no-ops.
+ */
+ prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+ if (!prog)
+ return -ENOMEM;
+ env->prog = prog;
+ info[cnt + 1].start = info[cnt].start;
+ info[cnt].start = prog->len - len + 1;
+ env->subprog_cnt++;
+ env->hidden_subprog_cnt++;
+ return 0;
+}
+
/* Do various post-verification rewrites in a single program pass.
* These rewrites simplify JIT and interpreter implementations.
*/
@@ -18337,6 +19196,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
struct bpf_map *map_ptr;
int i, ret, cnt, delta = 0;
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn patch[] = {
+ env->prog->insnsi[insn_cnt - 1],
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ };
+
+ ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ env->subprog_info[env->exception_callback_subprog].is_cb = true;
+ env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
+ env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+ }
+
for (i = 0; i < insn_cnt; i++, insn++) {
/* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
@@ -18606,6 +19485,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+ /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+ * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -19015,7 +19913,7 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-static int do_check_common(struct bpf_verifier_env *env, int subprog)
+static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state;
@@ -19046,7 +19944,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
- ret = btf_prepare_func_args(env, subprog, regs);
+ ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb);
if (ret)
goto out;
for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
@@ -19062,6 +19960,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
regs[i].id = ++env->id_gen;
}
}
+ if (is_ex_cb) {
+ state->frame[0]->in_exception_callback_fn = true;
+ env->subprog_info[subprog].is_cb = true;
+ env->subprog_info[subprog].is_async_cb = true;
+ env->subprog_info[subprog].is_exception_cb = true;
+ }
} else {
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
@@ -19126,7 +20030,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
continue;
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
- ret = do_check_common(env, i);
+ ret = do_check_common(env, i, env->exception_callback_subprog == i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -19143,7 +20047,7 @@ static int do_check_main(struct bpf_verifier_env *env)
int ret;
env->insn_idx = 0;
- ret = do_check_common(env, 0);
+ ret = do_check_common(env, 0, false);
if (!ret)
env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return ret;
@@ -19312,6 +20216,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "Subprog %s doesn't exist\n", tname);
return -EINVAL;
}
+ if (aux->func && aux->func[subprog]->aux->exception_cb) {
+ bpf_log(log,
+ "%s programs cannot attach to exception callback\n",
+ prog_extension ? "Extension" : "FENTRY/FEXIT");
+ return -EINVAL;
+ }
conservative = aux->func_info_aux[subprog].unreliable;
if (prog_extension) {
if (conservative) {
@@ -19641,6 +20551,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (!tr)
return -ENOMEM;
+ if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+ tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
prog->aux->dst_trampoline = tr;
return 0;
}
@@ -19736,6 +20649,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (!env->explored_states)
goto skip_full_check;
+ ret = check_btf_info_early(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = add_subprog_and_kfunc(env);
if (ret < 0)
goto skip_full_check;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c09531bace38..484adb375b15 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4923,9 +4923,11 @@ repeat:
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
+ unsigned long irqflags;
+
memset(it, 0, sizeof(*it));
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
it->ss = css->ss;
it->flags = flags;
@@ -4939,7 +4941,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
css_task_iter_advance(it);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
/**
@@ -4952,12 +4954,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_task) {
put_task_struct(it->cur_task);
it->cur_task = NULL;
}
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
/* @it may be half-advanced by skips, finish advancing */
if (it->flags & CSS_TASK_ITER_SKIPPED)
@@ -4970,7 +4974,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
css_task_iter_advance(it);
}
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
return it->cur_task;
}
@@ -4983,11 +4987,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
*/
void css_task_iter_end(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_cset) {
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
if (it->cur_dcset)
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 77c0c2370b6d..9de66bbbb3d1 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,8 @@
*/
static struct posix_clock *get_posix_clock(struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk = pccontext->clk;
down_read(&clk->rwsem);
@@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk)
static ssize_t posix_clock_read(struct file *fp, char __user *buf,
size_t count, loff_t *ppos)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -EINVAL;
@@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
return -ENODEV;
if (clk->ops.read)
- err = clk->ops.read(clk, fp->f_flags, buf, count);
+ err = clk->ops.read(pccontext, fp->f_flags, buf, count);
put_posix_clock(clk);
@@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
__poll_t result = 0;
@@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
return EPOLLERR;
if (clk->ops.poll)
- result = clk->ops.poll(clk, fp, wait);
+ result = clk->ops.poll(pccontext, fp, wait);
put_posix_clock(clk);
@@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
static long posix_clock_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;
@@ -79,7 +83,7 @@ static long posix_clock_ioctl(struct file *fp,
return -ENODEV;
if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
+ err = clk->ops.ioctl(pccontext, cmd, arg);
put_posix_clock(clk);
@@ -90,6 +94,7 @@ static long posix_clock_ioctl(struct file *fp,
static long posix_clock_compat_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;
@@ -97,7 +102,7 @@ static long posix_clock_compat_ioctl(struct file *fp,
return -ENODEV;
if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
+ err = clk->ops.ioctl(pccontext, cmd, arg);
put_posix_clock(clk);
@@ -110,6 +115,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
int err;
struct posix_clock *clk =
container_of(inode->i_cdev, struct posix_clock, cdev);
+ struct posix_clock_context *pccontext;
down_read(&clk->rwsem);
@@ -117,14 +123,20 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = -ENODEV;
goto out;
}
+ pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+ if (!pccontext) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pccontext->clk = clk;
+ fp->private_data = pccontext;
if (clk->ops.open)
- err = clk->ops.open(clk, fp->f_mode);
+ err = clk->ops.open(pccontext, fp->f_mode);
else
err = 0;
if (!err) {
get_device(clk->dev);
- fp->private_data = clk;
}
out:
up_read(&clk->rwsem);
@@ -133,14 +145,20 @@ out:
static int posix_clock_release(struct inode *inode, struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk;
int err = 0;
+ if (!pccontext)
+ return -ENODEV;
+ clk = pccontext->clk;
+
if (clk->ops.release)
- err = clk->ops.release(clk);
+ err = clk->ops.release(pccontext);
put_device(clk->dev);
+ kfree(pccontext);
fp->private_data = NULL;
return err;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 868008f56fec..df697c74d519 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -117,6 +117,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* and don't send kprobe event into ring-buffer,
* so return zero here
*/
+ rcu_read_lock();
+ bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
+ rcu_read_unlock();
ret = 0;
goto out;
}
@@ -2384,7 +2387,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
u32 *fd_type, const char **buf,
- u64 *probe_offset, u64 *probe_addr)
+ u64 *probe_offset, u64 *probe_addr,
+ unsigned long *missed)
{
bool is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
@@ -2419,7 +2423,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
#ifdef CONFIG_KPROBE_EVENTS
if (flags & TRACE_EVENT_FL_KPROBE)
err = bpf_get_kprobe_info(event, fd_type, buf,
- probe_offset, probe_addr,
+ probe_offset, probe_addr, missed,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
@@ -2614,6 +2618,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
info->kprobe_multi.count = kmulti_link->cnt;
info->kprobe_multi.flags = kmulti_link->flags;
+ info->kprobe_multi.missed = kmulti_link->fp.nmissed;
if (!uaddrs)
return 0;
@@ -2710,6 +2715,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
int err;
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ bpf_prog_inc_misses_counter(link->link.prog);
err = 0;
goto out;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index e834f149695b..a3442db35670 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1249,6 +1249,12 @@ static const struct file_operations kprobe_events_ops = {
.write = probes_write,
};
+static unsigned long trace_kprobe_missed(struct trace_kprobe *tk)
+{
+ return trace_kprobe_is_return(tk) ?
+ tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+}
+
/* Probes profiling interfaces */
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
@@ -1260,8 +1266,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
return 0;
tk = to_trace_kprobe(ev);
- nmissed = trace_kprobe_is_return(tk) ?
- tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+ nmissed = trace_kprobe_missed(tk);
seq_printf(m, " %-44s %15lu %15lu\n",
trace_probe_name(&tk->tp),
trace_kprobe_nhit(tk),
@@ -1607,7 +1612,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
const char **symbol, u64 *probe_offset,
- u64 *probe_addr, bool perf_type_tracepoint)
+ u64 *probe_addr, unsigned long *missed,
+ bool perf_type_tracepoint)
{
const char *pevent = trace_event_name(event->tp_event);
const char *group = event->tp_event->class->system;
@@ -1626,6 +1632,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
*probe_addr = kallsyms_show_value(current_cred()) ?
(unsigned long)tk->rp.kp.addr : 0;
*symbol = tk->symbol;
+ if (missed)
+ *missed = trace_kprobe_missed(tk);
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index de753403cdaf..9c581d6da843 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -556,7 +556,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
{
struct syscall_tp_t {
struct trace_entry ent;
- unsigned long syscall_nr;
+ int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
int i;
@@ -661,7 +661,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
{
struct syscall_tp_t {
struct trace_entry ent;
- unsigned long syscall_nr;
+ int syscall_nr;
unsigned long ret;
} __aligned(8) param;