summaryrefslogtreecommitdiff
path: root/kernel/bpf/syscall.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-04 13:38:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-04 13:38:03 -0700
commit0326074ff4652329f2a1a9c8685104576bd8d131 (patch)
tree9a7574c7ccb05bf4c7cb34fc5a65457bb8f495cb /kernel/bpf/syscall.c
parent522667b24f08009591c90e75bfe2ffb67f555498 (diff)
parent681bf011b9b5989c6e9db6beb64494918aab9a43 (diff)
Merge tag 'net-next-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - Introduce and use a single page frag cache for allocating small skb heads, clawing back the 10-20% performance regression in UDP flood test from previous fixes. - Run packets which already went thru HW coalescing thru SW GRO. This significantly improves TCP segment coalescing and simplifies deployments as different workloads benefit from HW or SW GRO. - Shrink the size of the base zero-copy send structure. - Move TCP init under a new slow / sleepable version of DO_ONCE(). BPF: - Add BPF-specific, any-context-safe memory allocator. - Add helpers/kfuncs for PKCS#7 signature verification from BPF programs. - Define a new map type and related helpers for user space -> kernel communication over a ring buffer (BPF_MAP_TYPE_USER_RINGBUF). - Allow targeting BPF iterators to loop through resources of one task/thread. - Add ability to call selected destructive functions. Expose crash_kexec() to allow BPF to trigger a kernel dump. Use CAP_SYS_BOOT check on the loading process to judge permissions. - Enable BPF to collect custom hierarchical cgroup stats efficiently by integrating with the rstat framework. - Support struct arguments for trampoline based programs. Only structs with size <= 16B and x86 are supported. - Invoke cgroup/connect{4,6} programs for unprivileged ICMP ping sockets (instead of just TCP and UDP sockets). - Add a helper for accessing CLOCK_TAI for time sensitive network related programs. - Support accessing network tunnel metadata's flags. - Make TCP SYN ACK RTO tunable by BPF programs with TCP Fast Open. - Add support for writing to Netfilter's nf_conn:mark. Protocols: - WiFi: more Extremely High Throughput (EHT) and Multi-Link Operation (MLO) work (802.11be, WiFi 7). - vsock: improve support for SO_RCVLOWAT. - SMC: support SO_REUSEPORT. - Netlink: define and document how to use netlink in a "modern" way. Support reporting missing attributes via extended ACK. - IPSec: support collect metadata mode for xfrm interfaces. - TCPv6: send consistent autoflowlabel in SYN_RECV state and RST packets. - TCP: introduce optional per-netns connection hash table to allow better isolation between namespaces (opt-in, at the cost of memory and cache pressure). - MPTCP: support TCP_FASTOPEN_CONNECT. - Add NEXT-C-SID support in Segment Routing (SRv6) End behavior. - Adjust IP_UNICAST_IF sockopt behavior for connected UDP sockets. - Open vSwitch: - Allow specifying ifindex of new interfaces. - Allow conntrack and metering in non-initial user namespace. - TLS: support the Korean ARIA-GCM crypto algorithm. - Remove DECnet support. Driver API: - Allow selecting the conduit interface used by each port in DSA switches, at runtime. - Ethernet Power Sourcing Equipment and Power Device support. - Add tc-taprio support for queueMaxSDU parameter, i.e. setting per traffic class max frame size for time-based packet schedules. - Support PHY rate matching - adapting between differing host-side and link-side speeds. - Introduce QUSGMII PHY mode and 1000BASE-KX interface mode. - Validate OF (device tree) nodes for DSA shared ports; make phylink-related properties mandatory on DSA and CPU ports. Enforcing more uniformity should allow transitioning to phylink. - Require that flash component name used during update matches one of the components for which version is reported by info_get(). - Remove "weight" argument from driver-facing NAPI API as much as possible. It's one of those magic knobs which seemed like a good idea at the time but is too indirect to use in practice. - Support offload of TLS connections with 256 bit keys. New hardware / drivers: - Ethernet: - Microchip KSZ9896 6-port Gigabit Ethernet Switch - Renesas Ethernet AVB (EtherAVB-IF) Gen4 SoCs - Analog Devices ADIN1110 and ADIN2111 industrial single pair Ethernet (10BASE-T1L) MAC+PHY. - Rockchip RV1126 Gigabit Ethernet (a version of stmmac IP). - Ethernet SFPs / modules: - RollBall / Hilink / Turris 10G copper SFPs - HALNy GPON module - WiFi: - CYW43439 SDIO chipset (brcmfmac) - CYW89459 PCIe chipset (brcmfmac) - BCM4378 on Apple platforms (brcmfmac) Drivers: - CAN: - gs_usb: HW timestamp support - Ethernet PHYs: - lan8814: cable diagnostics - Ethernet NICs: - Intel (100G): - implement control of FCS/CRC stripping - port splitting via devlink - L2TPv3 filtering offload - nVidia/Mellanox: - tunnel offload for sub-functions - MACSec offload, w/ Extended packet number and replay window offload - significantly restructure, and optimize the AF_XDP support, align the behavior with other vendors - Huawei: - configuring DSCP map for traffic class selection - querying standard FEC statistics - querying SerDes lane number via ethtool - Marvell/Cavium: - egress priority flow control - MACSec offload - AMD/SolarFlare: - PTP over IPv6 and raw Ethernet - small / embedded: - ax88772: convert to phylink (to support SFP cages) - altera: tse: convert to phylink - ftgmac100: support fixed link - enetc: standard Ethtool counters - macb: ZynqMP SGMII dynamic configuration support - tsnep: support multi-queue and use page pool - lan743x: Rx IP & TCP checksum offload - igc: add xdp frags support to ndo_xdp_xmit - Ethernet high-speed switches: - Marvell (prestera): - support SPAN port features (traffic mirroring) - nexthop object offloading - Microchip (sparx5): - multicast forwarding offload - QoS queuing offload (tc-mqprio, tc-tbf, tc-ets) - Ethernet embedded switches: - Marvell (mv88e6xxx): - support RGMII cmode - NXP (felix): - standardized ethtool counters - Microchip (lan966x): - QoS queuing offload (tc-mqprio, tc-tbf, tc-cbs, tc-ets) - traffic policing and mirroring - link aggregation / bonding offload - QUSGMII PHY mode support - Qualcomm 802.11ax WiFi (ath11k): - cold boot calibration support on WCN6750 - support to connect to a non-transmit MBSSID AP profile - enable remain-on-channel support on WCN6750 - Wake-on-WLAN support for WCN6750 - support to provide transmit power from firmware via nl80211 - support to get power save duration for each client - spectral scan support for 160 MHz - MediaTek WiFi (mt76): - WiFi-to-Ethernet bridging offload for MT7986 chips - RealTek WiFi (rtw89): - P2P support" * tag 'net-next-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1864 commits) eth: pse: add missing static inlines once: rename _SLOW to _SLEEPABLE net: pse-pd: add regulator based PSE driver dt-bindings: net: pse-dt: add bindings for regulator based PoDL PSE controller ethtool: add interface to interact with Ethernet Power Equipment net: mdiobus: search for PSE nodes by parsing PHY nodes. net: mdiobus: fwnode_mdiobus_register_phy() rework error handling net: add framework to support Ethernet PSE and PDs devices dt-bindings: net: phy: add PoDL PSE property net: marvell: prestera: Propagate nh state from hw to kernel net: marvell: prestera: Add neighbour cache accounting net: marvell: prestera: add stub handler neighbour events net: marvell: prestera: Add heplers to interact with fib_notifier_info net: marvell: prestera: Add length macros for prestera_ip_addr net: marvell: prestera: add delayed wq and flush wq on deinit net: marvell: prestera: Add strict cleanup of fib arbiter net: marvell: prestera: Add cleanup of allocated fib_nodes net: marvell: prestera: Add router nexthops ABI eth: octeon: fix build after netif_napi_add() changes net/mlx5: E-Switch, Return EBUSY if can't get mode lock ...
Diffstat (limited to 'kernel/bpf/syscall.c')
-rw-r--r--kernel/bpf/syscall.c46
1 files changed, 30 insertions, 16 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 27760627370d..7b373a5e861f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -598,7 +598,7 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
if (off_desc->type == BPF_KPTR_UNREF) {
u64 *p = (u64 *)btf_id_ptr;
- WRITE_ONCE(p, 0);
+ WRITE_ONCE(*p, 0);
continue;
}
old_ptr = xchg(btf_id_ptr, 0);
@@ -638,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
bpf_map_free_id(map, do_idr_lock);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
}
}
@@ -1046,7 +1049,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY) {
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
@@ -1413,19 +1417,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
value_size = bpf_map_value_size(map);
-
- err = -ENOMEM;
- value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
-
- err = -EFAULT;
- if (copy_from_bpfptr(value, uvalue, value_size) != 0)
- goto free_value;
+ }
err = bpf_map_update_value(map, f, key, value, attr->flags);
-free_value:
kvfree(value);
free_key:
kvfree(key);
@@ -1437,9 +1436,9 @@ err_put:
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -1459,7 +1458,7 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -2094,6 +2093,17 @@ struct bpf_prog_kstats {
u64 misses;
};
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
@@ -4395,7 +4405,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (attr->task_fd_query.flags != 0)
return -EINVAL;
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
if (!task)
return -ENOENT;
@@ -4941,7 +4953,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -5073,8 +5085,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
switch (cmd) {
case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
case BPF_LINK_CREATE: