summaryrefslogtreecommitdiff
path: root/kernel/bpf/task_iter.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-31 05:10:11 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-31 05:10:11 -1000
commit89ed67ef126c4160349c1b96fdb775ea6170ac90 (patch)
tree98caaf8bba44b21f9345a0af1dd2bd9987764e27 /kernel/bpf/task_iter.c
parent5a6a09e97199d6600d31383055f9d43fbbcbe86f (diff)
parentf1c73396133cb3d913e2075298005644ee8dfade (diff)
Merge tag 'net-next-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Support usec resolution of TCP timestamps, enabled selectively by a route attribute. - Defer regular TCP ACK while processing socket backlog, try to send a cumulative ACK at the end. Increase single TCP flow performance on a 200Gbit NIC by 20% (100Gbit -> 120Gbit). - The Fair Queuing (FQ) packet scheduler: - add built-in 3 band prio / WRR scheduling - support bypass if the qdisc is mostly idle (5% speed up for TCP RR) - improve inactive flow reporting - optimize the layout of structures for better cache locality - Support TCP Authentication Option (RFC 5925, TCP-AO), a more modern replacement for the old MD5 option. - Add more retransmission timeout (RTO) related statistics to TCP_INFO. - Support sending fragmented skbs over vsock sockets. - Make sure we send SIGPIPE for vsock sockets if socket was shutdown(). - Add sysctl for ignoring lower limit on lifetime in Router Advertisement PIO, based on an in-progress IETF draft. - Add sysctl to control activation of TCP ping-pong mode. - Add sysctl to make connection timeout in MPTCP configurable. - Support rcvlowat and notsent_lowat on MPTCP sockets, to help apps limit the number of wakeups. - Support netlink GET for MDB (multicast forwarding), allowing user space to request a single MDB entry instead of dumping the entire table. - Support selective FDB flushing in the VXLAN tunnel driver. - Allow limiting learned FDB entries in bridges, prevent OOM attacks. - Allow controlling via configfs netconsole targets which were created via the kernel cmdline at boot, rather than via configfs at runtime. - Support multiple PTP timestamp event queue readers with different filters. - MCTP over I3C. BPF: - Add new veth-like netdevice where BPF program defines the logic of the xmit routine. It can operate in L3 and L2 mode. - Support exceptions - allow asserting conditions which should never be true but are hard for the verifier to infer. With some extra flexibility around handling of the exit / failure: https://lwn.net/Articles/938435/ - Add support for local per-cpu kptr, allow allocating and storing per-cpu objects in maps. Access to those objects operates on the value for the current CPU. This allows to deprecate local one-off implementations of per-CPU storage like BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE maps. - Extend cgroup BPF sockaddr hooks for UNIX sockets. The use case is for systemd to re-implement the LogNamespace feature which allows running multiple instances of systemd-journald to process the logs of different services. - Enable open-coded task_vma iteration, after maple tree conversion made it hard to directly walk VMAs in tracing programs. - Add open-coded task, css_task and css iterator support. One of the use cases is customizable OOM victim selection via BPF. - Allow source address selection with bpf_*_fib_lookup(). - Add ability to pin BPF timer to the current CPU. - Prevent creation of infinite loops by combining tail calls and fentry/fexit programs. - Add missed stats for kprobes to retrieve the number of missed kprobe executions and subsequent executions of BPF programs. - Inherit system settings for CPU security mitigations. - Add BPF v4 CPU instruction support for arm32 and s390x. Changes to common code: - overflow: add DEFINE_FLEX() for on-stack definition of structs with flexible array members. - Process doc update with more guidance for reviewers. Driver API: - Simplify locking in WiFi (cfg80211 and mac80211 layers), use wiphy mutex in most places and remove a lot of smaller locks. - Create a common DPLL configuration API. Allow configuring and querying state of PLL circuits used for clock syntonization, in network time distribution. - Unify fragmented and full page allocation APIs in page pool code. Let drivers be ignorant of PAGE_SIZE. - Rework PHY state machine to avoid races with calls to phy_stop(). - Notify DSA drivers of MAC address changes on user ports, improve correctness of offloads which depend on matching port MAC addresses. - Allow antenna control on injected WiFi frames. - Reduce the number of variants of napi_schedule(). - Simplify error handling when composing devlink health messages. Misc: - A lot of KCSAN data race "fixes", from Eric. - A lot of __counted_by() annotations, from Kees. - A lot of strncpy -> strscpy and printf format fixes. - Replace master/slave terminology with conduit/user in DSA drivers. - Handful of KUnit tests for netdev and WiFi core. Removed: - AppleTalk COPS. - AppleTalk ipddp. - TI AR7 CPMAC Ethernet driver. Drivers: - Ethernet high-speed NICs: - Intel (100G, ice, idpf): - add a driver for the Intel E2000 IPUs - make CRC/FCS stripping configurable - cross-timestamping for E823 devices - basic support for E830 devices - use aux-bus for managing client drivers - i40e: report firmware versions via devlink - nVidia/Mellanox: - support 4-port NICs - increase max number of channels to 256 - optimize / parallelize SF creation flow - Broadcom (bnxt): - enhance NIC temperature reporting - support PAM4 speeds and lane configuration - Marvell OcteonTX2: - PTP pulse-per-second output support - enable hardware timestamping for VFs - Solarflare/AMD: - conntrack NAT offload and offload for tunnels - Wangxun (ngbe/txgbe): - expose HW statistics - Pensando/AMD: - support PCI level reset - narrow down the condition under which skbs are linearized - Netronome/Corigine (nfp): - support CHACHA20-POLY1305 crypto in IPsec offload - Ethernet NICs embedded, slower, virtual: - Synopsys (stmmac): - add Loongson-1 SoC support - enable use of HW queues with no offload capabilities - enable PPS input support on all 5 channels - increase TX coalesce timer to 5ms - RealTek USB (r8152): improve efficiency of Rx by using GRO frags - xen: support SW packet timestamping - add drivers for implementations based on TI's PRUSS (AM64x EVM) - nVidia/Mellanox Ethernet datacenter switches: - avoid poor HW resource use on Spectrum-4 by better block selection for IPv6 multicast forwarding and ordering of blocks in ACL region - Ethernet embedded switches: - Microchip: - support configuring the drive strength for EMI compliance - ksz9477: partial ACL support - ksz9477: HSR offload - ksz9477: Wake on LAN - Realtek: - rtl8366rb: respect device tree config of the CPU port - Ethernet PHYs: - support Broadcom BCM5221 PHYs - TI dp83867: support hardware LED blinking - CAN: - add support for Linux-PHY based CAN transceivers - at91_can: clean up and use rx-offload helpers - WiFi: - MediaTek (mt76): - new sub-driver for mt7925 USB/PCIe devices - HW wireless <> Ethernet bridging in MT7988 chips - mt7603/mt7628 stability improvements - Qualcomm (ath12k): - WCN7850: - enable 320 MHz channels in 6 GHz band - hardware rfkill support - enable IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS to make scan faster - read board data variant name from SMBIOS - QCN9274: mesh support - RealTek (rtw89): - TDMA-based multi-channel concurrency (MCC) - Silicon Labs (wfx): - Remain-On-Channel (ROC) support - Bluetooth: - ISO: many improvements for broadcast support - mark BCM4378/BCM4387 as BROKEN_LE_CODED - add support for QCA2066 - btmtksdio: enable Bluetooth wakeup from suspend" * tag 'net-next-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1816 commits) net: pcs: xpcs: Add 2500BASE-X case in get state for XPCS drivers net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() net: mana: Use xdp_set_features_flag instead of direct assignment vxlan: Cleanup IFLA_VXLAN_PORT_RANGE entry in vxlan_get_size() iavf: delete the iavf client interface iavf: add a common function for undoing the interrupt scheme iavf: use unregister_netdev iavf: rely on netdev's own registered state iavf: fix the waiting time for initial reset iavf: in iavf_down, don't queue watchdog_task if comms failed iavf: simplify mutex_trylock+sleep loops iavf: fix comments about old bit locks doc/netlink: Update schema to support cmd-cnt-name and cmd-max-name tools: ynl: introduce option to process unknown attributes or types ipvlan: properly track tx_errors netdevsim: Block until all devices are released nfp: using napi_build_skb() to replace build_skb() net: dsa: microchip: ksz9477: Fix spelling mistake "Enery" -> "Energy" net: dsa: microchip: Ensure Stable PME Pin State for Wake-on-LAN net: dsa: microchip: Refactor switch shutdown routine for WoL preparation ...
Diffstat (limited to 'kernel/bpf/task_iter.c')
-rw-r--r--kernel/bpf/task_iter.c282
1 files changed, 252 insertions, 30 deletions
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 82ad23b1d257..654601dd6b49 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,7 +7,9 @@
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
+#include <linux/mm_types.h>
#include "mmap_unlock_work.h"
static const char * const iter_task_type_names[] = {
@@ -35,16 +37,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
u32 *tid,
bool skip_if_dup_files)
{
- struct task_struct *task, *next_task;
+ struct task_struct *task;
struct pid *pid;
- u32 saved_tid;
+ u32 next_tid;
if (!*tid) {
/* The first time, the iterator calls this function. */
pid = find_pid_ns(common->pid, common->ns);
- if (!pid)
- return NULL;
-
task = get_pid_task(pid, PIDTYPE_TGID);
if (!task)
return NULL;
@@ -66,44 +65,27 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return task;
}
- pid = find_pid_ns(common->pid_visiting, common->ns);
- if (!pid)
- return NULL;
-
- task = get_pid_task(pid, PIDTYPE_PID);
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
if (!task)
return NULL;
retry:
- if (!pid_alive(task)) {
- put_task_struct(task);
- return NULL;
- }
+ task = next_thread(task);
- next_task = next_thread(task);
- put_task_struct(task);
- if (!next_task)
- return NULL;
-
- saved_tid = *tid;
- *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
- if (!*tid || *tid == common->pid) {
+ next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
+ if (!next_tid || next_tid == common->pid) {
/* Run out of tasks of a process. The tasks of a
* thread_group are linked as circular linked list.
*/
- *tid = saved_tid;
return NULL;
}
- get_task_struct(next_task);
- common->pid_visiting = *tid;
-
- if (skip_if_dup_files && task->files == task->group_leader->files) {
- task = next_task;
+ if (skip_if_dup_files && task->files == task->group_leader->files)
goto retry;
- }
- return next_task;
+ *tid = common->pid_visiting = next_tid;
+ get_task_struct(task);
+ return task;
}
static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
@@ -821,6 +803,246 @@ const struct bpf_func_proto bpf_find_vma_proto = {
.arg5_type = ARG_ANYTHING,
};
+struct bpf_iter_task_vma_kern_data {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct mmap_unlock_irq_work *work;
+ struct vma_iterator vmi;
+};
+
+struct bpf_iter_task_vma {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+ struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+ struct task_struct *task, u64 addr)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+ bool irq_work_busy = false;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+ /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+ * before, so non-NULL kit->data doesn't point to previously
+ * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+ */
+ kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+ if (!kit->data)
+ return -ENOMEM;
+
+ kit->data->task = get_task_struct(task);
+ kit->data->mm = task->mm;
+ if (!kit->data->mm) {
+ err = -ENOENT;
+ goto err_cleanup_iter;
+ }
+
+ /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+ if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
+
+ vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ return 0;
+
+err_cleanup_iter:
+ if (kit->data->task)
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ /* NULL kit->data signals failed bpf_iter_task_vma initialization */
+ kit->data = NULL;
+ return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (!kit->data) /* bpf_iter_task_vma_new failed */
+ return NULL;
+ return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (kit->data) {
+ bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ }
+}
+
+__diag_pop();
+
+struct bpf_iter_css_task {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_task_kern {
+ struct css_task_iter *css_it;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+ struct cgroup_subsys_state *css, unsigned int flags)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
+ __alignof__(struct bpf_iter_css_task));
+ kit->css_it = NULL;
+ switch (flags) {
+ case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
+ case CSS_TASK_ITER_PROCS:
+ case 0:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
+ if (!kit->css_it)
+ return -ENOMEM;
+ css_task_iter_start(css, flags, kit->css_it);
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return NULL;
+ return css_task_iter_next(kit->css_it);
+}
+
+__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return;
+ css_task_iter_end(kit->css_it);
+ bpf_mem_free(&bpf_global_ma, kit->css_it);
+}
+
+__diag_pop();
+
+struct bpf_iter_task {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+ struct task_struct *task;
+ struct task_struct *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+enum {
+ /* all process in the system */
+ BPF_TASK_ITER_ALL_PROCS,
+ /* all threads in the system */
+ BPF_TASK_ITER_ALL_THREADS,
+ /* all threads of a specific process */
+ BPF_TASK_ITER_PROC_THREADS
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
+ struct task_struct *task__nullable, unsigned int flags)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+ __alignof__(struct bpf_iter_task));
+
+ kit->task = kit->pos = NULL;
+ switch (flags) {
+ case BPF_TASK_ITER_ALL_THREADS:
+ case BPF_TASK_ITER_ALL_PROCS:
+ break;
+ case BPF_TASK_ITER_PROC_THREADS:
+ if (!task__nullable)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (flags == BPF_TASK_ITER_PROC_THREADS)
+ kit->task = task__nullable;
+ else
+ kit->task = &init_task;
+ kit->pos = kit->task;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ struct task_struct *pos;
+ unsigned int flags;
+
+ flags = kit->flags;
+ pos = kit->pos;
+
+ if (!pos)
+ return pos;
+
+ if (flags == BPF_TASK_ITER_ALL_PROCS)
+ goto get_next_task;
+
+ kit->pos = next_thread(kit->pos);
+ if (kit->pos == kit->task) {
+ if (flags == BPF_TASK_ITER_PROC_THREADS) {
+ kit->pos = NULL;
+ return pos;
+ }
+ } else
+ return pos;
+
+get_next_task:
+ kit->pos = next_task(kit->pos);
+ kit->task = kit->pos;
+ if (kit->pos == &init_task)
+ kit->pos = NULL;
+
+ return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
+__diag_pop();
+
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock(struct irq_work *entry)