summaryrefslogtreecommitdiff
path: root/tools/bpf/bpftool/common.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-08-03 16:29:08 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-08-03 16:29:08 -0700
commitf86d1fbbe7858884d6754534a0afbb74fc30bc26 (patch)
treef61796870edefbe77d495e9d719c68af1d14275b /tools/bpf/bpftool/common.c
parent526942b8134cc34d25d27f95dfff98b8ce2f6fcd (diff)
parent7c6327c77d509e78bff76f2a4551fcfee851682e (diff)
Merge tag 'net-next-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking changes from Paolo Abeni: "Core: - Refactor the forward memory allocation to better cope with memory pressure with many open sockets, moving from a per socket cache to a per-CPU one - Replace rwlocks with RCU for better fairness in ping, raw sockets and IP multicast router. - Network-side support for IO uring zero-copy send. - A few skb drop reason improvements, including codegen the source file with string mapping instead of using macro magic. - Rename reference tracking helpers to a more consistent netdev_* schema. - Adapt u64_stats_t type to address load/store tearing issues. - Refine debug helper usage to reduce the log noise caused by bots. BPF: - Improve socket map performance, avoiding skb cloning on read operation. - Add support for 64 bits enum, to match types exposed by kernel. - Introduce support for sleepable uprobes program. - Introduce support for enum textual representation in libbpf. - New helpers to implement synproxy with eBPF/XDP. - Improve loop performances, inlining indirect calls when possible. - Removed all the deprecated libbpf APIs. - Implement new eBPF-based LSM flavor. - Add type match support, which allow accurate queries to the eBPF used types. - A few TCP congetsion control framework usability improvements. - Add new infrastructure to manipulate CT entries via eBPF programs. - Allow for livepatch (KLP) and BPF trampolines to attach to the same kernel function. Protocols: - Introduce per network namespace lookup tables for unix sockets, increasing scalability and reducing contention. - Preparation work for Wi-Fi 7 Multi-Link Operation (MLO) support. - Add support to forciby close TIME_WAIT TCP sockets via user-space tools. - Significant performance improvement for the TLS 1.3 receive path, both for zero-copy and not-zero-copy. - Support for changing the initial MTPCP subflow priority/backup status - Introduce virtually contingus buffers for sockets over RDMA, to cope better with memory pressure. - Extend CAN ethtool support with timestamping capabilities - Refactor CAN build infrastructure to allow building only the needed features. Driver API: - Remove devlink mutex to allow parallel commands on multiple links. - Add support for pause stats in distributed switch. - Implement devlink helpers to query and flash line cards. - New helper for phy mode to register conversion. New hardware / drivers: - Ethernet DSA driver for the rockchip mt7531 on BPI-R2 Pro. - Ethernet DSA driver for the Renesas RZ/N1 A5PSW switch. - Ethernet DSA driver for the Microchip LAN937x switch. - Ethernet PHY driver for the Aquantia AQR113C EPHY. - CAN driver for the OBD-II ELM327 interface. - CAN driver for RZ/N1 SJA1000 CAN controller. - Bluetooth: Infineon CYW55572 Wi-Fi plus Bluetooth combo device. Drivers: - Intel Ethernet NICs: - i40e: add support for vlan pruning - i40e: add support for XDP framented packets - ice: improved vlan offload support - ice: add support for PPPoE offload - Mellanox Ethernet (mlx5) - refactor packet steering offload for performance and scalability - extend support for TC offload - refactor devlink code to clean-up the locking schema - support stacked vlans for bridge offloads - use TLS objects pool to improve connection rate - Netronome Ethernet NICs (nfp): - extend support for IPv6 fields mangling offload - add support for vepa mode in HW bridge - better support for virtio data path acceleration (VDPA) - enable TSO by default - Microsoft vNIC driver (mana) - add support for XDP redirect - Others Ethernet drivers: - bonding: add per-port priority support - microchip lan743x: extend phy support - Fungible funeth: support UDP segmentation offload and XDP xmit - Solarflare EF100: add support for virtual function representors - MediaTek SoC: add XDP support - Mellanox Ethernet/IB switch (mlxsw): - dropped support for unreleased H/W (XM router). - improved stats accuracy - unified bridge model coversion improving scalability (parts 1-6) - support for PTP in Spectrum-2 asics - Broadcom PHYs - add PTP support for BCM54210E - add support for the BCM53128 internal PHY - Marvell Ethernet switches (prestera): - implement support for multicast forwarding offload - Embedded Ethernet switches: - refactor OcteonTx MAC filter for better scalability - improve TC H/W offload for the Felix driver - refactor the Microchip ksz8 and ksz9477 drivers to share the probe code (parts 1, 2), add support for phylink mac configuration - Other WiFi: - Microchip wilc1000: diable WEP support and enable WPA3 - Atheros ath10k: encapsulation offload support Old code removal: - Neterion vxge ethernet driver: this is untouched since more than 10 years" * tag 'net-next-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1890 commits) doc: sfp-phylink: Fix a broken reference wireguard: selftests: support UML wireguard: allowedips: don't corrupt stack when detecting overflow wireguard: selftests: update config fragments wireguard: ratelimiter: use hrtimer in selftest net/mlx5e: xsk: Discard unaligned XSK frames on striding RQ net: usb: ax88179_178a: Bind only to vendor-specific interface selftests: net: fix IOAM test skip return code net: usb: make USB_RTL8153_ECM non user configurable net: marvell: prestera: remove reduntant code octeontx2-pf: Reduce minimum mtu size to 60 net: devlink: Fix missing mutex_unlock() call net/tls: Remove redundant workqueue flush before destroy net: txgbe: Fix an error handling path in txgbe_probe() net: dsa: Fix spelling mistakes and cleanup code Documentation: devlink: add add devlink-selftests to the table of contents dccp: put dccp_qpolicy_full() and dccp_qpolicy_push() in the same lock net: ionic: fix error check for vlan flags in ionic_set_nic_features() net: ice: fix error NETIF_F_HW_VLAN_CTAG_FILTER check in ice_vsi_sync_fltr() nfp: flower: add support for tunnel offload without key ID ...
Diffstat (limited to 'tools/bpf/bpftool/common.c')
-rw-r--r--tools/bpf/bpftool/common.c160
1 files changed, 112 insertions, 48 deletions
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index c740142c24d8..067e9ea59e3b 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -13,13 +13,17 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
-#include <linux/limits.h>
-#include <linux/magic.h>
#include <net/if.h>
#include <sys/mount.h>
+#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/vfs.h>
+#include <linux/filter.h>
+#include <linux/limits.h>
+#include <linux/magic.h>
+#include <linux/unistd.h>
+
#include <bpf/bpf.h>
#include <bpf/hashmap.h>
#include <bpf/libbpf.h> /* libbpf_num_possible_cpus */
@@ -31,52 +35,6 @@
#define BPF_FS_MAGIC 0xcafe4a11
#endif
-const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
- [BPF_CGROUP_INET_INGRESS] = "ingress",
- [BPF_CGROUP_INET_EGRESS] = "egress",
- [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
- [BPF_CGROUP_INET_SOCK_RELEASE] = "sock_release",
- [BPF_CGROUP_SOCK_OPS] = "sock_ops",
- [BPF_CGROUP_DEVICE] = "device",
- [BPF_CGROUP_INET4_BIND] = "bind4",
- [BPF_CGROUP_INET6_BIND] = "bind6",
- [BPF_CGROUP_INET4_CONNECT] = "connect4",
- [BPF_CGROUP_INET6_CONNECT] = "connect6",
- [BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
- [BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
- [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4",
- [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6",
- [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4",
- [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6",
- [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
- [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
- [BPF_CGROUP_SYSCTL] = "sysctl",
- [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
- [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
- [BPF_CGROUP_GETSOCKOPT] = "getsockopt",
- [BPF_CGROUP_SETSOCKOPT] = "setsockopt",
- [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser",
- [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict",
- [BPF_SK_SKB_VERDICT] = "sk_skb_verdict",
- [BPF_SK_MSG_VERDICT] = "sk_msg_verdict",
- [BPF_LIRC_MODE2] = "lirc_mode2",
- [BPF_FLOW_DISSECTOR] = "flow_dissector",
- [BPF_TRACE_RAW_TP] = "raw_tp",
- [BPF_TRACE_FENTRY] = "fentry",
- [BPF_TRACE_FEXIT] = "fexit",
- [BPF_MODIFY_RETURN] = "mod_ret",
- [BPF_LSM_MAC] = "lsm_mac",
- [BPF_SK_LOOKUP] = "sk_lookup",
- [BPF_TRACE_ITER] = "trace_iter",
- [BPF_XDP_DEVMAP] = "xdp_devmap",
- [BPF_XDP_CPUMAP] = "xdp_cpumap",
- [BPF_XDP] = "xdp",
- [BPF_SK_REUSEPORT_SELECT] = "sk_skb_reuseport_select",
- [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_skb_reuseport_select_or_migrate",
- [BPF_PERF_EVENT] = "perf_event",
- [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
-};
-
void p_err(const char *fmt, ...)
{
va_list ap;
@@ -118,6 +76,75 @@ static bool is_bpffs(char *path)
return (unsigned long)st_fs.f_type == BPF_FS_MAGIC;
}
+/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to
+ * memcg-based memory accounting for BPF maps and programs. This was done in
+ * commit 97306be45fbe ("Merge branch 'switch to memcg-based memory
+ * accounting'"), in Linux 5.11.
+ *
+ * Libbpf also offers to probe for memcg-based accounting vs rlimit, but does
+ * so by checking for the availability of a given BPF helper and this has
+ * failed on some kernels with backports in the past, see commit 6b4384ff1088
+ * ("Revert "bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK"").
+ * Instead, we can probe by lowering the process-based rlimit to 0, trying to
+ * load a BPF object, and resetting the rlimit. If the load succeeds then
+ * memcg-based accounting is supported.
+ *
+ * This would be too dangerous to do in the library, because multithreaded
+ * applications might attempt to load items while the rlimit is at 0. Given
+ * that bpftool is single-threaded, this is fine to do here.
+ */
+static bool known_to_need_rlimit(void)
+{
+ struct rlimit rlim_init, rlim_cur_zero = {};
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ size_t insn_cnt = ARRAY_SIZE(insns);
+ union bpf_attr attr;
+ int prog_fd, err;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ attr.insns = ptr_to_u64(insns);
+ attr.insn_cnt = insn_cnt;
+ attr.license = ptr_to_u64("GPL");
+
+ if (getrlimit(RLIMIT_MEMLOCK, &rlim_init))
+ return false;
+
+ /* Drop the soft limit to zero. We maintain the hard limit to its
+ * current value, because lowering it would be a permanent operation
+ * for unprivileged users.
+ */
+ rlim_cur_zero.rlim_max = rlim_init.rlim_max;
+ if (setrlimit(RLIMIT_MEMLOCK, &rlim_cur_zero))
+ return false;
+
+ /* Do not use bpf_prog_load() from libbpf here, because it calls
+ * bump_rlimit_memlock(), interfering with the current probe.
+ */
+ prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ err = errno;
+
+ /* reset soft rlimit to its initial value */
+ setrlimit(RLIMIT_MEMLOCK, &rlim_init);
+
+ if (prog_fd < 0)
+ return err == EPERM;
+
+ close(prog_fd);
+ return false;
+}
+
+void set_max_rlimit(void)
+{
+ struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
+
+ if (known_to_need_rlimit())
+ setrlimit(RLIMIT_MEMLOCK, &rinf);
+}
+
static int
mnt_fs(const char *target, const char *type, char *buff, size_t bufflen)
{
@@ -289,6 +316,7 @@ const char *get_fd_type_name(enum bpf_obj_type type)
[BPF_OBJ_UNKNOWN] = "unknown",
[BPF_OBJ_PROG] = "prog",
[BPF_OBJ_MAP] = "map",
+ [BPF_OBJ_LINK] = "link",
};
if (type < 0 || type >= ARRAY_SIZE(names) || !names[type])
@@ -1009,3 +1037,39 @@ bool equal_fn_for_key_as_id(const void *k1, const void *k2, void *ctx)
{
return k1 == k2;
}
+
+const char *bpf_attach_type_input_str(enum bpf_attach_type t)
+{
+ switch (t) {
+ case BPF_CGROUP_INET_INGRESS: return "ingress";
+ case BPF_CGROUP_INET_EGRESS: return "egress";
+ case BPF_CGROUP_INET_SOCK_CREATE: return "sock_create";
+ case BPF_CGROUP_INET_SOCK_RELEASE: return "sock_release";
+ case BPF_CGROUP_SOCK_OPS: return "sock_ops";
+ case BPF_CGROUP_DEVICE: return "device";
+ case BPF_CGROUP_INET4_BIND: return "bind4";
+ case BPF_CGROUP_INET6_BIND: return "bind6";
+ case BPF_CGROUP_INET4_CONNECT: return "connect4";
+ case BPF_CGROUP_INET6_CONNECT: return "connect6";
+ case BPF_CGROUP_INET4_POST_BIND: return "post_bind4";
+ case BPF_CGROUP_INET6_POST_BIND: return "post_bind6";
+ case BPF_CGROUP_INET4_GETPEERNAME: return "getpeername4";
+ case BPF_CGROUP_INET6_GETPEERNAME: return "getpeername6";
+ case BPF_CGROUP_INET4_GETSOCKNAME: return "getsockname4";
+ case BPF_CGROUP_INET6_GETSOCKNAME: return "getsockname6";
+ case BPF_CGROUP_UDP4_SENDMSG: return "sendmsg4";
+ case BPF_CGROUP_UDP6_SENDMSG: return "sendmsg6";
+ case BPF_CGROUP_SYSCTL: return "sysctl";
+ case BPF_CGROUP_UDP4_RECVMSG: return "recvmsg4";
+ case BPF_CGROUP_UDP6_RECVMSG: return "recvmsg6";
+ case BPF_CGROUP_GETSOCKOPT: return "getsockopt";
+ case BPF_CGROUP_SETSOCKOPT: return "setsockopt";
+ case BPF_TRACE_RAW_TP: return "raw_tp";
+ case BPF_TRACE_FENTRY: return "fentry";
+ case BPF_TRACE_FEXIT: return "fexit";
+ case BPF_MODIFY_RETURN: return "mod_ret";
+ case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select";
+ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate";
+ default: return libbpf_bpf_attach_type_str(t);
+ }
+}