From ccab434e674ca95d483788b1895a70c21b7f016a Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 15 Nov 2023 11:08:57 +0100 Subject: usb: aqc111: check packet for fixup for true limit If a device sends a packet that is inbetween 0 and sizeof(u64) the value passed to skb_trim() as length will wrap around ending up as some very large value. The driver will then proceed to parse the header located at that position, which will either oops or process some random value. The fix is to check against sizeof(u64) rather than 0, which the driver currently does. The issue exists since the introduction of the driver. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/aqc111.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c index a017e9de2119..7b8afa589a53 100644 --- a/drivers/net/usb/aqc111.c +++ b/drivers/net/usb/aqc111.c @@ -1079,17 +1079,17 @@ static int aqc111_rx_fixup(struct usbnet *dev, struct sk_buff *skb) u16 pkt_count = 0; u64 desc_hdr = 0; u16 vlan_tag = 0; - u32 skb_len = 0; + u32 skb_len; if (!skb) goto err; - if (skb->len == 0) + skb_len = skb->len; + if (skb_len < sizeof(desc_hdr)) goto err; - skb_len = skb->len; /* RX Descriptor Header */ - skb_trim(skb, skb->len - sizeof(desc_hdr)); + skb_trim(skb, skb_len - sizeof(desc_hdr)); desc_hdr = le64_to_cpup((u64 *)skb_tail_pointer(skb)); /* Check these packets */ -- cgit From 7fbd5fc2b35a8f559a6b380dfa9bcd964a758186 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 15 Nov 2023 11:53:31 +0100 Subject: stmmac: dwmac-loongson: Add architecture dependency Only present the DWMAC_LOONGSON option on architectures where it can actually be used. This follows the same logic as the DWMAC_INTEL option. Signed-off-by: Jean Delvare Cc: Keguang Zhang Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index a2b9e289aa36..85dcda51df05 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -280,7 +280,7 @@ config DWMAC_INTEL config DWMAC_LOONGSON tristate "Loongson PCI DWMAC support" default MACH_LOONGSON64 - depends on STMMAC_ETH && PCI + depends on (MACH_LOONGSON64 || COMPILE_TEST) && STMMAC_ETH && PCI depends on COMMON_CLK help This selects the LOONGSON PCI bus support for the stmmac driver, -- cgit From 0c3bd086d12d185650d095a906662593ec607bd0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 15 Nov 2023 17:15:40 +0000 Subject: rxrpc: Fix some minor issues with bundle tracing Fix some superficial issues with the tracing of rxrpc_bundle structs, including: (1) Set the debug_id when the bundle is allocated rather than when it is set up so that the "NEW" trace line displays the correct bundle ID. (2) Show the refcount when emitting the "FREE" traceline. Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/conn_client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 981ca5b98bcb..1d95f8bc769f 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -73,6 +73,7 @@ static void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, gfp_t gfp) { + static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle; bundle = kzalloc(sizeof(*bundle), gfp); @@ -85,6 +86,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, bundle->upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); bundle->service_id = call->dest_srx.srx_service; bundle->security_level = call->security_level; + bundle->debug_id = atomic_inc_return(&rxrpc_bundle_id); refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); INIT_LIST_HEAD(&bundle->waiting_calls); @@ -105,7 +107,8 @@ struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle, static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { - trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_free); + trace_rxrpc_bundle(bundle->debug_id, refcount_read(&bundle->ref), + rxrpc_bundle_free); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); key_put(bundle->key); kfree(bundle); @@ -239,7 +242,6 @@ dont_reuse: */ int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) { - static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle, *candidate; struct rxrpc_local *local = call->local; struct rb_node *p, **pp, *parent; @@ -306,7 +308,6 @@ int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) } _debug("new bundle"); - candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id); rb_link_node(&candidate->local_node, parent, pp); rb_insert_color(&candidate->local_node, &local->client_bundles); call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); -- cgit From d565fa4300d9ebd5ba3bbd259ce841f8dab609d6 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Wed, 15 Nov 2023 16:59:58 +0100 Subject: s390/ism: ism driver implies smc protocol Since commit a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") you can build the ism code without selecting the SMC network protocol. That leaves some ism functions be reported as unused. Move these functions under the conditional compile with CONFIG_SMC. Also codify the suggestion to also configure the SMC protocol in ism's Kconfig - but with an "imply" rather than a "select" as SMC depends on other config options and allow for a deliberate decision not to build SMC. Also, mention that in ISM's help. Fixes: a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") Reported-by: Randy Dunlap Closes: https://lore.kernel.org/netdev/afd142a2-1fa0-46b9-8b2d-7652d41d3ab8@infradead.org/ Signed-off-by: Gerd Bayer Reviewed-by: Wenjia Zhang Reviewed-by: Simon Horman Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Signed-off-by: David S. Miller --- drivers/s390/net/Kconfig | 3 +- drivers/s390/net/ism_drv.c | 93 +++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 4902d45e929c..c61e6427384c 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -103,10 +103,11 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" depends on PCI + imply SMC default n help Select this option if you want to use the Internal Shared Memory - vPCI Adapter. + vPCI Adapter. The adapter can be used with the SMC network protocol. To compile as a module choose M. The module name is ism. If unsure, choose N. diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 6df7f377d2f9..81aabbfbbe2c 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -30,7 +30,6 @@ static const struct pci_device_id ism_device_table[] = { MODULE_DEVICE_TABLE(pci, ism_device_table); static debug_info_t *ism_debug_info; -static const struct smcd_ops ism_ops; #define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ @@ -289,22 +288,6 @@ out: return ret; } -static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, - u32 vid) -{ - union ism_query_rgid cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_QUERY_RGID; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.vlan_valid = vid_valid; - cmd.request.vlan_id = vid; - - return ism_cmd(ism, &cmd); -} - static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -429,23 +412,6 @@ static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) -{ - union ism_sig_ieq cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.trigger_irq = trigger_irq; - cmd.request.event_code = event_code; - cmd.request.info = info; - - return ism_cmd(ism, &cmd); -} - static unsigned int max_bytes(unsigned int start, unsigned int len, unsigned int boundary) { @@ -503,14 +469,6 @@ u8 *ism_get_seid(void) } EXPORT_SYMBOL_GPL(ism_get_seid); -static u16 ism_get_chid(struct ism_dev *ism) -{ - if (!ism || !ism->pdev) - return 0; - - return to_zpci(ism->pdev)->pchid; -} - static void ism_handle_event(struct ism_dev *ism) { struct ism_event *entry; @@ -569,11 +527,6 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return IRQ_HANDLED; } -static u64 ism_get_local_gid(struct ism_dev *ism) -{ - return ism->local_gid; -} - static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -774,6 +727,22 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) +static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, + u32 vid) +{ + union ism_query_rgid cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_QUERY_RGID; + cmd.request.hdr.len = sizeof(cmd.request); + + cmd.request.rgid = rgid; + cmd.request.vlan_valid = vid_valid; + cmd.request.vlan_id = vid; + + return ism_cmd(ism, &cmd); +} + static int smcd_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, u32 vid) { @@ -811,6 +780,23 @@ static int smcd_reset_vlan_required(struct smcd_dev *smcd) return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); } +static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, + u32 event_code, u64 info) +{ + union ism_sig_ieq cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; + cmd.request.hdr.len = sizeof(cmd.request); + + cmd.request.rgid = rgid; + cmd.request.trigger_irq = trigger_irq; + cmd.request.event_code = event_code; + cmd.request.info = info; + + return ism_cmd(ism, &cmd); +} + static int smcd_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -830,11 +816,24 @@ static int smcd_supports_v2(void) SYSTEM_EID.type[0] != '0'; } +static u64 ism_get_local_gid(struct ism_dev *ism) +{ + return ism->local_gid; +} + static u64 smcd_get_local_gid(struct smcd_dev *smcd) { return ism_get_local_gid(smcd->priv); } +static u16 ism_get_chid(struct ism_dev *ism) +{ + if (!ism || !ism->pdev) + return 0; + + return to_zpci(ism->pdev)->pchid; +} + static u16 smcd_get_chid(struct smcd_dev *smcd) { return ism_get_chid(smcd->priv); -- cgit From 75a50c4f5b957a34dedb7c8cce52b582a9162828 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 16 Nov 2023 18:01:41 +0100 Subject: kselftest: rtnetlink: fix ip route command typo The blamed commit below introduced a typo causing 'gretap' test-case failures: ./rtnetlink.sh -t kci_test_gretap -v COMMAND: ip link add name test-dummy0 type dummy COMMAND: ip link set test-dummy0 up COMMAND: ip netns add testns COMMAND: ip link help gretap 2>&1 | grep -q '^Usage:' COMMAND: ip -netns testns link add dev gretap00 type gretap seq key 102 local 172.16.1.100 remote 172.16.1.200 COMMAND: ip -netns testns addr add dev gretap00 10.1.1.100/24 COMMAND: ip -netns testns link set dev gretap00 ups Error: either "dev" is duplicate, or "ups" is a garbage. COMMAND: ip -netns testns link del gretap00 COMMAND: ip -netns testns link add dev gretap00 type gretap external COMMAND: ip -netns testns link del gretap00 FAIL: gretap Fix it by using the correct keyword. Fixes: 9c2a19f71515 ("kselftest: rtnetlink.sh: add verbose flag") Signed-off-by: Paolo Abeni Reviewed-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/rtnetlink.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 5f2b3f6c0d74..38be9706c45f 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -859,7 +859,7 @@ kci_test_gretap() run_cmd ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24 - run_cmd ip -netns "$testns" link set dev $DEV_NS ups + run_cmd ip -netns "$testns" link set dev $DEV_NS up run_cmd ip -netns "$testns" link del "$DEV_NS" # test external mode -- cgit From 3798680f2fbbe0ca3ab6138b34e0d161c36497ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Nov 2023 13:12:58 +0000 Subject: rxrpc: Fix RTT determination to use any ACK as a source Fix RTT determination to be able to use any type of ACK as the response from which RTT can be calculated provided its ack.serial is non-zero and matches the serial number of an outgoing DATA or ACK packet. This shouldn't be limited to REQUESTED-type ACKs as these can have other types substituted for them for things like duplicate or out-of-order packets. Fixes: 4700c4d80b7b ("rxrpc: Fix loss of RTT samples due to interposed ACK") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 +- net/rxrpc/input.c | 35 ++++++++++++++++------------------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4c53a5ef6257..f7e537f64db4 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -328,7 +328,7 @@ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ - EM(rxrpc_rtt_rx_cancel, "CNCL") \ + EM(rxrpc_rtt_rx_other_ack, "OACK") \ EM(rxrpc_rtt_rx_obsolete, "OBSL") \ EM(rxrpc_rtt_rx_lost, "LOST") \ EM(rxrpc_rtt_rx_ping_response, "PONG") \ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 030d64f282f3..3f9594d12519 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -643,12 +643,8 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_mb(); /* Read data before setting avail bit */ set_bit(i, &call->rtt_avail); - if (type != rxrpc_rtt_rx_cancel) - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, - sent_at, resp_time); - else - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_cancel, i, - orig_serial, acked_serial, 0, 0); + rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + sent_at, resp_time); matched = true; } @@ -801,20 +797,21 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) summary.ack_reason, nr_acks); rxrpc_inc_stat(call->rxnet, stat_rx_acks[ack.reason]); - switch (ack.reason) { - case RXRPC_ACK_PING_RESPONSE: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_ping_response); - break; - case RXRPC_ACK_REQUESTED: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_requested_ack); - break; - default: - if (acked_serial != 0) + if (acked_serial != 0) { + switch (ack.reason) { + case RXRPC_ACK_PING_RESPONSE: rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_cancel); - break; + rxrpc_rtt_rx_ping_response); + break; + case RXRPC_ACK_REQUESTED: + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, + rxrpc_rtt_rx_requested_ack); + break; + default: + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, + rxrpc_rtt_rx_other_ack); + break; + } } if (ack.reason == RXRPC_ACK_PING) { -- cgit From 1a01319feef7047aa2ba400ffa3e047776aa29ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Nov 2023 13:12:59 +0000 Subject: rxrpc: Defer the response to a PING ACK until we've parsed it Defer the generation of a PING RESPONSE ACK in response to a PING ACK until we've parsed the PING ACK so that we pick up any changes to the packet queue so that we can update ackinfo. This is also applied to an ACK generated in response to an ACK with the REQUEST_ACK flag set. Note that whilst the problem was added in commit 248f219cb8bc, it didn't really matter at that point because the ACK was proposed in softirq mode and generated asynchronously later in process context, taking the latest values at the time. But this fix is only needed since the move to parse incoming packets in an I/O thread rather than in softirq and generate the ACK at point of proposal (b0346843b1076b34a0278ff601f8f287535cb064). Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/input.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 3f9594d12519..92495e73b869 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -814,14 +814,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } } - if (ack.reason == RXRPC_ACK_PING) { - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, - rxrpc_propose_ack_respond_to_ping); - } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, - rxrpc_propose_ack_respond_to_ack); - } - /* If we get an EXCEEDS_WINDOW ACK from the server, it probably * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. @@ -832,7 +824,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - return; + goto send_response; } /* If we get an OUT_OF_SEQUENCE ACK from the server, that can also @@ -846,7 +838,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - return; + goto send_response; } /* Discard any out-of-order or duplicate ACKs (outside lock). */ @@ -854,7 +846,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, first_soft_ack, call->acks_first_seq, prev_pkt, call->acks_prev_seq); - return; + goto send_response; } info.rxMTU = 0; @@ -894,7 +886,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_CALL_SERVER_AWAIT_ACK: break; default: - return; + goto send_response; } if (before(hard_ack, call->acks_hard_ack) || @@ -906,7 +898,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (after(hard_ack, call->acks_hard_ack)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); - return; + goto send_response; } } @@ -924,6 +916,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_lost_reply); rxrpc_congestion_management(call, skb, &summary, acked_serial); + +send_response: + if (ack.reason == RXRPC_ACK_PING) + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + rxrpc_propose_ack_respond_to_ping); + else if (sp->hdr.flags & RXRPC_REQUEST_ACK) + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_propose_ack_respond_to_ack); } /* -- cgit From 76df934c6d5f5c93ba7a0112b1818620ddc10b19 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 16 Nov 2023 12:11:51 -0800 Subject: MAINTAINERS: Add netdev subsystem profile link The netdev subsystem has had a subsystem process document for a while now. Link it appropriately in MAINTAINERS with the P: tag. Cc: Jakub Kicinski Cc: "David S. Miller" Cc: Eric Dumazet Cc: Paolo Abeni Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 482d428472e7..80a95878c61d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14994,6 +14994,7 @@ M: Jakub Kicinski M: Paolo Abeni L: netdev@vger.kernel.org S: Maintained +P: Documentation/process/maintainer-netdev.rst Q: https://patchwork.kernel.org/project/netdevbpf/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git @@ -15045,6 +15046,7 @@ M: Jakub Kicinski M: Paolo Abeni L: netdev@vger.kernel.org S: Maintained +P: Documentation/process/maintainer-netdev.rst Q: https://patchwork.kernel.org/project/netdevbpf/list/ B: mailto:netdev@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git -- cgit From 8ba2c459668cfe2aaacc5ebcd35b4b9ef8643013 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 17 Nov 2023 18:11:08 +0800 Subject: net: wangxun: fix kernel panic due to null pointer When the device uses a custom subsystem vendor ID, the function wx_sw_init() returns before the memory of 'wx->mac_table' is allocated. The null pointer will causes the kernel panic. Fixes: 79625f45ca73 ("net: wangxun: Move MAC address handling to libwx") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 8 +++++--- drivers/net/ethernet/wangxun/ngbe/ngbe_main.c | 4 +--- drivers/net/ethernet/wangxun/txgbe/txgbe_main.c | 4 +--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index a3c5de9d547a..533e912af089 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -1769,10 +1769,12 @@ int wx_sw_init(struct wx *wx) wx->subsystem_device_id = pdev->subsystem_device; } else { err = wx_flash_read_dword(wx, 0xfffdc, &ssid); - if (!err) - wx->subsystem_device_id = swab16((u16)ssid); + if (err < 0) { + wx_err(wx, "read of internal subsystem device id failed\n"); + return err; + } - return err; + wx->subsystem_device_id = swab16((u16)ssid); } wx->mac_table = kcalloc(wx->mac.num_rar_entries, diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c index 3d43f808c86b..8db804543e66 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c @@ -121,10 +121,8 @@ static int ngbe_sw_init(struct wx *wx) /* PCI config space info */ err = wx_sw_init(wx); - if (err < 0) { - wx_err(wx, "read of internal subsystem device id failed\n"); + if (err < 0) return err; - } /* mac type, phy type , oem type */ ngbe_init_type_code(wx); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index 70f0b5c01dac..526250102db2 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -364,10 +364,8 @@ static int txgbe_sw_init(struct wx *wx) /* PCI config space info */ err = wx_sw_init(wx); - if (err < 0) { - wx_err(wx, "read of internal subsystem device id failed\n"); + if (err < 0) return err; - } txgbe_init_type_code(wx); -- cgit From 93da8d75a66568ba4bb5b14ad2833acd7304cd02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Nov 2023 14:17:33 +0000 Subject: wireguard: use DEV_STATS_INC() wg_xmit() can be called concurrently, KCSAN reported [1] some device stats updates can be lost. Use DEV_STATS_INC() for this unlikely case. [1] BUG: KCSAN: data-race in wg_xmit / wg_xmit read-write to 0xffff888104239160 of 8 bytes by task 1375 on cpu 0: wg_xmit+0x60f/0x680 drivers/net/wireguard/device.c:231 __netdev_start_xmit include/linux/netdevice.h:4918 [inline] netdev_start_xmit include/linux/netdevice.h:4932 [inline] xmit_one net/core/dev.c:3543 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3559 ... read-write to 0xffff888104239160 of 8 bytes by task 1378 on cpu 1: wg_xmit+0x60f/0x680 drivers/net/wireguard/device.c:231 __netdev_start_xmit include/linux/netdevice.h:4918 [inline] netdev_start_xmit include/linux/netdevice.h:4932 [inline] xmit_one net/core/dev.c:3543 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3559 ... v2: also change wg_packet_consume_data_done() (Hangbin Liu) and wg_packet_purge_staged_packets() Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Jason A. Donenfeld Cc: Hangbin Liu Signed-off-by: Jason A. Donenfeld Reviewed-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/wireguard/device.c | 4 ++-- drivers/net/wireguard/receive.c | 12 ++++++------ drivers/net/wireguard/send.c | 3 ++- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 258dcc103921..deb9636b0ecf 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -210,7 +210,7 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) */ while (skb_queue_len(&peer->staged_packet_queue) > MAX_STAGED_PACKETS) { dev_kfree_skb(__skb_dequeue(&peer->staged_packet_queue)); - ++dev->stats.tx_dropped; + DEV_STATS_INC(dev, tx_dropped); } skb_queue_splice_tail(&packets, &peer->staged_packet_queue); spin_unlock_bh(&peer->staged_packet_queue.lock); @@ -228,7 +228,7 @@ err_icmp: else if (skb->protocol == htons(ETH_P_IPV6)) icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); err: - ++dev->stats.tx_errors; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return ret; } diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 0b3f0c843550..a176653c8861 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -416,20 +416,20 @@ dishonest_packet_peer: net_dbg_skb_ratelimited("%s: Packet has unallowed src IP (%pISc) from peer %llu (%pISpfsc)\n", dev->name, skb, peer->internal_id, &peer->endpoint.addr); - ++dev->stats.rx_errors; - ++dev->stats.rx_frame_errors; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_type: net_dbg_ratelimited("%s: Packet is neither ipv4 nor ipv6 from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); - ++dev->stats.rx_errors; - ++dev->stats.rx_frame_errors; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_size: net_dbg_ratelimited("%s: Packet has incorrect size from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); - ++dev->stats.rx_errors; - ++dev->stats.rx_length_errors; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_length_errors); goto packet_processed; packet_processed: dev_kfree_skb(skb); diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 95c853b59e1d..0d48e0f4a1ba 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -333,7 +333,8 @@ err: void wg_packet_purge_staged_packets(struct wg_peer *peer) { spin_lock_bh(&peer->staged_packet_queue.lock); - peer->device->dev->stats.tx_dropped += peer->staged_packet_queue.qlen; + DEV_STATS_ADD(peer->device->dev, tx_dropped, + peer->staged_packet_queue.qlen); __skb_queue_purge(&peer->staged_packet_queue); spin_unlock_bh(&peer->staged_packet_queue.lock); } -- cgit From 5f228d7c8a539714c1e9b7e7534f76bb7979f268 Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Fri, 17 Nov 2023 16:10:18 +0530 Subject: octeontx2-pf: Fix memory leak during interface down During 'ifconfig down' one RSS memory was not getting freed. This patch fixes the same. Fixes: 81a4362016e7 ("octeontx2-pf: Add RSS multi group support") Signed-off-by: Suman Ghosh Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 91b99fd70361..ba95ac913274 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1934,6 +1934,8 @@ int otx2_stop(struct net_device *netdev) /* Clear RSS enable flag */ rss = &pf->hw.rss_info; rss->enable = false; + if (!netif_is_rxfh_configured(netdev)) + kfree(rss->rss_ctx[DEFAULT_RSS_CONTEXT_GROUP]); /* Cleanup Queue IRQ */ vec = pci_irq_vector(pf->pdev, -- cgit From 938dbead34cd139c50c5d51cc58e18ef2f1c3d2c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Nov 2023 19:30:06 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for SOCK_DIAG modules W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to all the sock diag modules in one fell swoop. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 1 + net/ipv4/raw_diag.c | 1 + net/ipv4/tcp_diag.c | 1 + net/ipv4/udp_diag.c | 1 + net/mptcp/mptcp_diag.c | 1 + net/packet/diag.c | 1 + net/sctp/diag.c | 1 + net/smc/smc_diag.c | 1 + net/tipc/diag.c | 1 + net/unix/diag.c | 1 + net/vmw_vsock/diag.c | 1 + net/xdp/xsk_diag.c | 1 + 12 files changed, 12 insertions(+) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f01aee832aab..7d0e7aaa71e0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1481,5 +1481,6 @@ static void __exit inet_diag_exit(void) module_init(inet_diag_init); module_exit(inet_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 63a40e4b678f..fe2140c8375c 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -257,5 +257,6 @@ static void __exit raw_diag_exit(void) module_init(raw_diag_init); module_exit(raw_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 01b50fa79189..4cbe4b44425a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -247,4 +247,5 @@ static void __exit tcp_diag_exit(void) module_init(tcp_diag_init); module_exit(tcp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index de3f2d31f510..dc41a22ee80e 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -296,5 +296,6 @@ static void __exit udp_diag_exit(void) module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */); diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 8df1bdb647e2..5409c2ea3f57 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -245,4 +245,5 @@ static void __exit mptcp_diag_exit(void) module_init(mptcp_diag_init); module_exit(mptcp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); diff --git a/net/packet/diag.c b/net/packet/diag.c index f6b200cb3c06..9a7980e3309d 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -262,4 +262,5 @@ static void __exit packet_diag_exit(void) module_init(packet_diag_init); module_exit(packet_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("PACKET socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */); diff --git a/net/sctp/diag.c b/net/sctp/diag.c index c3d6b92dd386..eb05131ff1dd 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -527,4 +527,5 @@ static void __exit sctp_diag_exit(void) module_init(sctp_diag_init); module_exit(sctp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SCTP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 7ff2152971a5..a584613aca12 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -268,5 +268,6 @@ static void __exit smc_diag_exit(void) module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SMC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME); diff --git a/net/tipc/diag.c b/net/tipc/diag.c index 73137f4aeb68..18733451c9e0 100644 --- a/net/tipc/diag.c +++ b/net/tipc/diag.c @@ -113,4 +113,5 @@ module_init(tipc_diag_init); module_exit(tipc_diag_exit); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TIPC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC); diff --git a/net/unix/diag.c b/net/unix/diag.c index 616b55c5b890..bec09a3a1d44 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -339,4 +339,5 @@ static void __exit unix_diag_exit(void) module_init(unix_diag_init); module_exit(unix_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UNIX socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 1 /* AF_LOCAL */); diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c index a2823b1c5e28..2e29994f92ff 100644 --- a/net/vmw_vsock/diag.c +++ b/net/vmw_vsock/diag.c @@ -174,5 +174,6 @@ static void __exit vsock_diag_exit(void) module_init(vsock_diag_init); module_exit(vsock_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VMware Virtual Sockets monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 40 /* AF_VSOCK */); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 22b36c8143cf..9f8955367275 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -211,4 +211,5 @@ static void __exit xsk_diag_exit(void) module_init(xsk_diag_init); module_exit(xsk_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("XDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP); -- cgit From 79e0c5be8c73a674c92bd4ba77b75f4f8c91d32e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:13 +0100 Subject: net, vrf: Move dstats structure to core Just move struct pcpu_dstats out of the vrf into the core, and streamline the field names slightly, so they better align with the {t,l}stats ones. No functional change otherwise. A conversion of the u64s to u64_stats_t could be done at a separate point in future. This move is needed as we are moving the {t,l,d}stats allocation/freeing to the core. Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Cc: Jakub Kicinski Cc: David Ahern Link: https://lore.kernel.org/r/20231114004220.6495-2-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/vrf.c | 24 +++++++----------------- include/linux/netdevice.h | 10 ++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index db766941b78f..3e6e0fdc3ba7 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -121,22 +121,12 @@ struct net_vrf { int ifindex; }; -struct pcpu_dstats { - u64 tx_pkts; - u64 tx_bytes; - u64 tx_drps; - u64 rx_pkts; - u64 rx_bytes; - u64 rx_drps; - struct u64_stats_sync syncp; -}; - static void vrf_rx_stats(struct net_device *dev, int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); - dstats->rx_pkts++; + dstats->rx_packets++; dstats->rx_bytes += len; u64_stats_update_end(&dstats->syncp); } @@ -161,10 +151,10 @@ static void vrf_get_stats64(struct net_device *dev, do { start = u64_stats_fetch_begin(&dstats->syncp); tbytes = dstats->tx_bytes; - tpkts = dstats->tx_pkts; - tdrops = dstats->tx_drps; + tpkts = dstats->tx_packets; + tdrops = dstats->tx_drops; rbytes = dstats->rx_bytes; - rpkts = dstats->rx_pkts; + rpkts = dstats->rx_packets; } while (u64_stats_fetch_retry(&dstats->syncp, start)); stats->tx_bytes += tbytes; stats->tx_packets += tpkts; @@ -421,7 +411,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) vrf_rx_stats(dev, len); else - this_cpu_inc(dev->dstats->rx_drps); + this_cpu_inc(dev->dstats->rx_drops); return NETDEV_TX_OK; } @@ -616,11 +606,11 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); - dstats->tx_pkts++; + dstats->tx_packets++; dstats->tx_bytes += len; u64_stats_update_end(&dstats->syncp); } else { - this_cpu_inc(dev->dstats->tx_drps); + this_cpu_inc(dev->dstats->tx_drops); } return ret; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a16c9cc063fe..98082113156e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2755,6 +2755,16 @@ struct pcpu_sw_netstats { struct u64_stats_sync syncp; } __aligned(4 * sizeof(u64)); +struct pcpu_dstats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_drops; + u64 tx_packets; + u64 tx_bytes; + u64 tx_drops; + struct u64_stats_sync syncp; +} __aligned(8 * sizeof(u64)); + struct pcpu_lstats { u64_stats_t packets; u64_stats_t bytes; -- cgit From 34d21de99cea9cb17967874313e5b0262527833c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:14 +0100 Subject: net: Move {l,t,d}stats allocation to core and convert veth & vrf Move {l,t,d}stats allocation to the core and let netdevs pick the stats type they need. That way the driver doesn't have to bother with error handling (allocation failure checking, making sure free happens in the right spot, etc) - all happening in the core. Co-developed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Cc: David Ahern Link: https://lore.kernel.org/r/20231114004220.6495-3-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/veth.c | 16 ++-------------- drivers/net/vrf.c | 14 +++----------- include/linux/netdevice.h | 20 +++++++++++++++---- net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 69 insertions(+), 30 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 9980517ed8b0..ac030c241d1a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1506,25 +1506,12 @@ static void veth_free_queues(struct net_device *dev) static int veth_dev_init(struct net_device *dev) { - int err; - - dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); - if (!dev->lstats) - return -ENOMEM; - - err = veth_alloc_queues(dev); - if (err) { - free_percpu(dev->lstats); - return err; - } - - return 0; + return veth_alloc_queues(dev); } static void veth_dev_free(struct net_device *dev) { veth_free_queues(dev); - free_percpu(dev->lstats); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -1796,6 +1783,7 @@ static void veth_setup(struct net_device *dev) NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 3e6e0fdc3ba7..bb95ce43cd97 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1164,22 +1164,15 @@ static void vrf_dev_uninit(struct net_device *dev) vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); - - free_percpu(dev->dstats); - dev->dstats = NULL; } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); - dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); - if (!dev->dstats) - goto out_nomem; - /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) - goto out_stats; + goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; @@ -1193,9 +1186,6 @@ static int vrf_dev_init(struct net_device *dev) out_rth: vrf_rtable_release(dev, vrf); -out_stats: - free_percpu(dev->dstats); - dev->dstats = NULL; out_nomem: return -ENOMEM; } @@ -1694,6 +1684,8 @@ static void vrf_setup(struct net_device *dev) dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; + + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 98082113156e..2564e209465e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1797,6 +1797,13 @@ enum netdev_ml_priv_type { ML_PRIV_CAN, }; +enum netdev_stat_type { + NETDEV_PCPU_STAT_NONE, + NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */ + NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */ + NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ +}; + /** * struct net_device - The DEVICE structure. * @@ -1991,10 +1998,14 @@ enum netdev_ml_priv_type { * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type - * @lstats: Loopback statistics - * @tstats: Tunnel statistics - * @dstats: Dummy statistics - * @vstats: Virtual ethernet statistics + * + * @pcpu_stat_type: Type of device statistics which the core should + * allocate/free: none, lstats, tstats, dstats. none + * means the driver is handling statistics allocation/ + * freeing internally. + * @lstats: Loopback statistics: packets, bytes + * @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes + * @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes * * @garp_port: GARP * @mrp_port: MRP @@ -2354,6 +2365,7 @@ struct net_device { void *ml_priv; enum netdev_ml_priv_type ml_priv_type; + enum netdev_stat_type pcpu_stat_type:8; union { struct pcpu_lstats __percpu *lstats; struct pcpu_sw_netstats __percpu *tstats; diff --git a/net/core/dev.c b/net/core/dev.c index af53f6d838ce..0cc6e283edba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10051,6 +10051,46 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); +static int netdev_do_alloc_pcpu_stats(struct net_device *dev) +{ + void __percpu *v; + + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return 0; + case NETDEV_PCPU_STAT_LSTATS: + v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); + break; + default: + return -EINVAL; + } + + return v ? 0 : -ENOMEM; +} + +static void netdev_do_free_pcpu_stats(struct net_device *dev) +{ + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return; + case NETDEV_PCPU_STAT_LSTATS: + free_percpu(dev->lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + free_percpu(dev->tstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + free_percpu(dev->dstats); + break; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@ -10111,9 +10151,13 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } + ret = netdev_do_alloc_pcpu_stats(dev); + if (ret) + goto err_uninit; + ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) - goto err_uninit; + goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable @@ -10219,6 +10263,8 @@ err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); +err_free_pcpu: + netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10471,6 +10517,7 @@ void netdev_run_todo(void) WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); + netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) -- cgit From ae1658272c6491a31ac968e39882fc569f312ac3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:15 +0100 Subject: netkit: Add tstats per-CPU traffic counters Add dev->tstats traffic accounting to netkit. The latter contains per-CPU RX and TX counters. The dev's TX counters are bumped upon pass/unspec as well as redirect verdicts, in other words, on everything except for drops. The dev's RX counters are bumped upon successful __netif_rx(), as well as from skb_do_redirect() (not part of this commit here). Using dev->lstats with having just a single packets/bytes counter and inferring one another's RX counters from the peer dev's lstats is not possible given skb_do_redirect() can also bump the device's stats. Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-4-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/netkit.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 5a0f86f38f09..99de11f9cde5 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -68,6 +68,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) netdev_tx_t ret_dev = NET_XMIT_SUCCESS; const struct bpf_mprog_entry *entry; struct net_device *peer; + int len = skb->len; rcu_read_lock(); peer = rcu_dereference(nk->peer); @@ -85,15 +86,22 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) case NETKIT_PASS: skb->protocol = eth_type_trans(skb, skb->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - __netif_rx(skb); + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { + dev_sw_netstats_tx_add(dev, 1, len); + dev_sw_netstats_rx_add(peer, len); + } else { + goto drop_stats; + } break; case NETKIT_REDIRECT: + dev_sw_netstats_tx_add(dev, 1, len); skb_do_redirect(skb); break; case NETKIT_DROP: default: drop: kfree_skb(skb); +drop_stats: dev_core_stats_tx_dropped_inc(dev); ret_dev = NET_XMIT_DROP; break; @@ -174,6 +182,13 @@ static struct net_device *netkit_peer_dev(struct net_device *dev) return rcu_dereference(netkit_priv(dev)->peer); } +static void netkit_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + dev_fetch_sw_netstats(stats, dev->tstats); + stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); +} + static void netkit_uninit(struct net_device *dev); static const struct net_device_ops netkit_netdev_ops = { @@ -184,6 +199,7 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_set_rx_headroom = netkit_set_headroom, .ndo_get_iflink = netkit_get_iflink, .ndo_get_peer_dev = netkit_peer_dev, + .ndo_get_stats64 = netkit_get_stats, .ndo_uninit = netkit_uninit, .ndo_features_check = passthru_features_check, }; @@ -218,6 +234,7 @@ static void netkit_setup(struct net_device *dev) ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->flags |= IFF_NOARP; dev->priv_flags &= ~IFF_TX_SKB_SHARING; -- cgit From 6f2684bf2b4460c84d0d34612a939f78b96b03fc Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 14 Nov 2023 01:42:16 +0100 Subject: veth: Use tstats per-CPU traffic counters Currently veth devices use the lstats per-CPU traffic counters, which only cover TX traffic. veth_get_stats64() actually populates RX stats of a veth device from its peer's TX counters, based on the assumption that a veth device can _only_ receive packets from its peer, which is no longer true: For example, recent CNIs (like Cilium) can use the bpf_redirect_peer() BPF helper to redirect traffic from NIC's tc ingress to veth's tc ingress (in a different netns), skipping veth's peer device. Unfortunately, this kind of traffic isn't currently accounted for in veth's RX stats. In preparation for the fix, use tstats (instead of lstats) to maintain both RX and TX counters for each veth device. We'll use RX counters for bpf_redirect_peer() traffic, and keep using TX counters for the usual "peer-to-peer" traffic. In veth_get_stats64(), calculate RX stats by _adding_ RX count to peer's TX count, in order to cover both kinds of traffic. veth_stats_rx() might need a name change (perhaps to "veth_stats_xdp()") for less confusion, but let's leave it to another patch to keep the fix minimal. Signed-off-by: Peilin Ye Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-5-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/veth.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ac030c241d1a..6cc352296c67 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -373,7 +373,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { if (!use_napi) - dev_lstats_add(dev, length); + dev_sw_netstats_tx_add(dev, 1, length); else __veth_xdp_flush(rq); } else { @@ -387,14 +387,6 @@ drop: return ret; } -static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) -{ - struct veth_priv *priv = netdev_priv(dev); - - dev_lstats_read(dev, packets, bytes); - return atomic64_read(&priv->dropped); -} - static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -432,24 +424,24 @@ static void veth_get_stats64(struct net_device *dev, struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; struct veth_stats rx; - u64 packets, bytes; - tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); - tot->tx_bytes = bytes; - tot->tx_packets = packets; + tot->tx_dropped = atomic64_read(&priv->dropped); + dev_fetch_sw_netstats(tot, dev->tstats); veth_stats_rx(&rx, dev); tot->tx_dropped += rx.xdp_tx_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; - tot->rx_bytes = rx.xdp_bytes; - tot->rx_packets = rx.xdp_packets; + tot->rx_bytes += rx.xdp_bytes; + tot->rx_packets += rx.xdp_packets; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (peer) { - veth_stats_tx(peer, &packets, &bytes); - tot->rx_bytes += bytes; - tot->rx_packets += packets; + struct rtnl_link_stats64 tot_peer = {}; + + dev_fetch_sw_netstats(&tot_peer, peer->tstats); + tot->rx_bytes += tot_peer.tx_bytes; + tot->rx_packets += tot_peer.tx_packets; veth_stats_rx(&rx, peer); tot->tx_dropped += rx.peer_tq_xdp_xmit_err; @@ -1783,7 +1775,7 @@ static void veth_setup(struct net_device *dev) NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; -- cgit From 024ee930cb3c9ae49e4266aee89cfde0ebb407e1 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 14 Nov 2023 01:42:17 +0100 Subject: bpf: Fix dev's rx stats for bpf_redirect_peer traffic Traffic redirected by bpf_redirect_peer() (used by recent CNIs like Cilium) is not accounted for in the RX stats of supported devices (that is, veth and netkit), confusing user space metrics collectors such as cAdvisor [0], as reported by Youlun. Fix it by calling dev_sw_netstats_rx_add() in skb_do_redirect(), to update RX traffic counters. Devices that support ndo_get_peer_dev _must_ use the @tstats per-CPU counters (instead of @lstats, or @dstats). To make this more fool-proof, error out when ndo_get_peer_dev is set but @tstats are not selected. [0] Specifically, the "container_network_receive_{byte,packet}s_total" counters are affected. Fixes: 9aa1206e8f48 ("bpf: Add redirect_peer helper") Reported-by: Youlun Zhang Signed-off-by: Peilin Ye Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-6-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- net/core/dev.c | 8 ++++++++ net/core/filter.c | 1 + 2 files changed, 9 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 0cc6e283edba..c879246be48d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10055,6 +10055,14 @@ static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; + /* Drivers implementing ndo_get_peer_dev must support tstat + * accounting, so that skb_do_redirect() can bump the dev's + * RX stats upon network namespace switch. + */ + if (dev->netdev_ops->ndo_get_peer_dev && + dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) + return -EOPNOTSUPP; + switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 383f96b0a1c7..cca810987c8d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2492,6 +2492,7 @@ int skb_do_redirect(struct sk_buff *skb) net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; + dev_sw_netstats_rx_add(dev, skb->len); return -EAGAIN; } return flags & BPF_F_NEIGH ? -- cgit From 2c225425704078282e152ba692649237f78b3d7a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:18 +0100 Subject: bpf, netkit: Add indirect call wrapper for fetching peer dev ndo_get_peer_dev is used in tcx BPF fast path, therefore make use of indirect call wrapper and therefore optimize the bpf_redirect_peer() internal handling a bit. Add a small skb_get_peer_dev() wrapper which utilizes the INDIRECT_CALL_1() macro instead of open coding. Future work could potentially add a peer pointer directly into struct net_device in future and convert veth and netkit over to use it so that eventually ndo_get_peer_dev can be removed. Co-developed-by: Nikolay Aleksandrov Signed-off-by: Nikolay Aleksandrov Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231114004220.6495-7-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/netkit.c | 3 ++- include/net/netkit.h | 6 ++++++ net/core/filter.c | 18 +++++++++++++----- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 99de11f9cde5..97bd6705c241 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -177,7 +178,7 @@ out: rcu_read_unlock(); } -static struct net_device *netkit_peer_dev(struct net_device *dev) +INDIRECT_CALLABLE_SCOPE struct net_device *netkit_peer_dev(struct net_device *dev) { return rcu_dereference(netkit_priv(dev)->peer); } diff --git a/include/net/netkit.h b/include/net/netkit.h index 0ba2e6b847ca..9ec0163739f4 100644 --- a/include/net/netkit.h +++ b/include/net/netkit.h @@ -10,6 +10,7 @@ int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev)); #else static inline int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -34,5 +35,10 @@ static inline int netkit_prog_query(const union bpf_attr *attr, { return -EINVAL; } + +static inline struct net_device *netkit_peer_dev(struct net_device *dev) +{ + return NULL; +} #endif /* CONFIG_NETKIT */ #endif /* __NET_NETKIT_H */ diff --git a/net/core/filter.c b/net/core/filter.c index cca810987c8d..7e4d7c3bcc84 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include "dev.h" @@ -2468,6 +2469,16 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); +static struct net_device *skb_get_peer_dev(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (likely(ops->ndo_get_peer_dev)) + return INDIRECT_CALL_1(ops->ndo_get_peer_dev, + netkit_peer_dev, dev); + return NULL; +} + int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -2481,12 +2492,9 @@ int skb_do_redirect(struct sk_buff *skb) if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (unlikely(!ops->ndo_get_peer_dev || - !skb_at_tc_ingress(skb))) + if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; - dev = ops->ndo_get_peer_dev(dev); + dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) -- cgit From eee82da79f036bb49ff80d3088b9530e3c2e57eb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:19 +0100 Subject: selftests/bpf: De-veth-ize the tc_redirect test case No functional changes to the test case, but just renaming various functions, variables, etc, to remove veth part of their name for making it more generic and reusable later on (e.g. for netkit). Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-8-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 263 +++++++++++---------- 1 file changed, 137 insertions(+), 126 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 6ee22c3b251a..407ff4e9bc78 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -110,11 +110,16 @@ static void netns_setup_namespaces_nofail(const char *verb) } } +enum dev_mode { + MODE_VETH, +}; + struct netns_setup_result { - int ifindex_veth_src; - int ifindex_veth_src_fwd; - int ifindex_veth_dst; - int ifindex_veth_dst_fwd; + enum dev_mode dev_mode; + int ifindex_src; + int ifindex_src_fwd; + int ifindex_dst; + int ifindex_dst_fwd; }; static int get_ifaddr(const char *name, char *ifaddr) @@ -140,55 +145,59 @@ static int get_ifaddr(const char *name, char *ifaddr) static int netns_setup_links_and_routes(struct netns_setup_result *result) { struct nstoken *nstoken = NULL; - char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {}; + char src_fwd_addr[IFADDR_STR_LEN+1] = {}; - SYS(fail, "ip link add veth_src type veth peer name veth_src_fwd"); - SYS(fail, "ip link add veth_dst type veth peer name veth_dst_fwd"); + if (result->dev_mode == MODE_VETH) { + SYS(fail, "ip link add src type veth peer name src_fwd"); + SYS(fail, "ip link add dst type veth peer name dst_fwd"); - SYS(fail, "ip link set veth_dst_fwd address " MAC_DST_FWD); - SYS(fail, "ip link set veth_dst address " MAC_DST); + SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD); + SYS(fail, "ip link set dst address " MAC_DST); + } - if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr)) + if (get_ifaddr("src_fwd", src_fwd_addr)) goto fail; - result->ifindex_veth_src = if_nametoindex("veth_src"); - if (!ASSERT_GT(result->ifindex_veth_src, 0, "ifindex_veth_src")) + result->ifindex_src = if_nametoindex("src"); + if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src")) goto fail; - result->ifindex_veth_src_fwd = if_nametoindex("veth_src_fwd"); - if (!ASSERT_GT(result->ifindex_veth_src_fwd, 0, "ifindex_veth_src_fwd")) + result->ifindex_src_fwd = if_nametoindex("src_fwd"); + if (!ASSERT_GT(result->ifindex_src_fwd, 0, "ifindex_src_fwd")) goto fail; - result->ifindex_veth_dst = if_nametoindex("veth_dst"); - if (!ASSERT_GT(result->ifindex_veth_dst, 0, "ifindex_veth_dst")) + result->ifindex_dst = if_nametoindex("dst"); + if (!ASSERT_GT(result->ifindex_dst, 0, "ifindex_dst")) goto fail; - result->ifindex_veth_dst_fwd = if_nametoindex("veth_dst_fwd"); - if (!ASSERT_GT(result->ifindex_veth_dst_fwd, 0, "ifindex_veth_dst_fwd")) + result->ifindex_dst_fwd = if_nametoindex("dst_fwd"); + if (!ASSERT_GT(result->ifindex_dst_fwd, 0, "ifindex_dst_fwd")) goto fail; - SYS(fail, "ip link set veth_src netns " NS_SRC); - SYS(fail, "ip link set veth_src_fwd netns " NS_FWD); - SYS(fail, "ip link set veth_dst_fwd netns " NS_FWD); - SYS(fail, "ip link set veth_dst netns " NS_DST); + SYS(fail, "ip link set src netns " NS_SRC); + SYS(fail, "ip link set src_fwd netns " NS_FWD); + SYS(fail, "ip link set dst_fwd netns " NS_FWD); + SYS(fail, "ip link set dst netns " NS_DST); /** setup in 'src' namespace */ nstoken = open_netns(NS_SRC); if (!ASSERT_OK_PTR(nstoken, "setns src")) goto fail; - SYS(fail, "ip addr add " IP4_SRC "/32 dev veth_src"); - SYS(fail, "ip addr add " IP6_SRC "/128 dev veth_src nodad"); - SYS(fail, "ip link set dev veth_src up"); + SYS(fail, "ip addr add " IP4_SRC "/32 dev src"); + SYS(fail, "ip addr add " IP6_SRC "/128 dev src nodad"); + SYS(fail, "ip link set dev src up"); - SYS(fail, "ip route add " IP4_DST "/32 dev veth_src scope global"); - SYS(fail, "ip route add " IP4_NET "/16 dev veth_src scope global"); - SYS(fail, "ip route add " IP6_DST "/128 dev veth_src scope global"); + SYS(fail, "ip route add " IP4_DST "/32 dev src scope global"); + SYS(fail, "ip route add " IP4_NET "/16 dev src scope global"); + SYS(fail, "ip route add " IP6_DST "/128 dev src scope global"); - SYS(fail, "ip neigh add " IP4_DST " dev veth_src lladdr %s", - veth_src_fwd_addr); - SYS(fail, "ip neigh add " IP6_DST " dev veth_src lladdr %s", - veth_src_fwd_addr); + if (result->dev_mode == MODE_VETH) { + SYS(fail, "ip neigh add " IP4_DST " dev src lladdr %s", + src_fwd_addr); + SYS(fail, "ip neigh add " IP6_DST " dev src lladdr %s", + src_fwd_addr); + } close_netns(nstoken); @@ -201,15 +210,15 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) * needs v4 one in order to start ARP probing. IP4_NET route is added * to the endpoints so that the ARP processing will reply. */ - SYS(fail, "ip addr add " IP4_SLL "/32 dev veth_src_fwd"); - SYS(fail, "ip addr add " IP4_DLL "/32 dev veth_dst_fwd"); - SYS(fail, "ip link set dev veth_src_fwd up"); - SYS(fail, "ip link set dev veth_dst_fwd up"); + SYS(fail, "ip addr add " IP4_SLL "/32 dev src_fwd"); + SYS(fail, "ip addr add " IP4_DLL "/32 dev dst_fwd"); + SYS(fail, "ip link set dev src_fwd up"); + SYS(fail, "ip link set dev dst_fwd up"); - SYS(fail, "ip route add " IP4_SRC "/32 dev veth_src_fwd scope global"); - SYS(fail, "ip route add " IP6_SRC "/128 dev veth_src_fwd scope global"); - SYS(fail, "ip route add " IP4_DST "/32 dev veth_dst_fwd scope global"); - SYS(fail, "ip route add " IP6_DST "/128 dev veth_dst_fwd scope global"); + SYS(fail, "ip route add " IP4_SRC "/32 dev src_fwd scope global"); + SYS(fail, "ip route add " IP6_SRC "/128 dev src_fwd scope global"); + SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global"); + SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global"); close_netns(nstoken); @@ -218,16 +227,18 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) if (!ASSERT_OK_PTR(nstoken, "setns dst")) goto fail; - SYS(fail, "ip addr add " IP4_DST "/32 dev veth_dst"); - SYS(fail, "ip addr add " IP6_DST "/128 dev veth_dst nodad"); - SYS(fail, "ip link set dev veth_dst up"); + SYS(fail, "ip addr add " IP4_DST "/32 dev dst"); + SYS(fail, "ip addr add " IP6_DST "/128 dev dst nodad"); + SYS(fail, "ip link set dev dst up"); - SYS(fail, "ip route add " IP4_SRC "/32 dev veth_dst scope global"); - SYS(fail, "ip route add " IP4_NET "/16 dev veth_dst scope global"); - SYS(fail, "ip route add " IP6_SRC "/128 dev veth_dst scope global"); + SYS(fail, "ip route add " IP4_SRC "/32 dev dst scope global"); + SYS(fail, "ip route add " IP4_NET "/16 dev dst scope global"); + SYS(fail, "ip route add " IP6_SRC "/128 dev dst scope global"); - SYS(fail, "ip neigh add " IP4_SRC " dev veth_dst lladdr " MAC_DST_FWD); - SYS(fail, "ip neigh add " IP6_SRC " dev veth_dst lladdr " MAC_DST_FWD); + if (result->dev_mode == MODE_VETH) { + SYS(fail, "ip neigh add " IP4_SRC " dev dst lladdr " MAC_DST_FWD); + SYS(fail, "ip neigh add " IP6_SRC " dev dst lladdr " MAC_DST_FWD); + } close_netns(nstoken); @@ -293,23 +304,23 @@ static int netns_load_bpf(const struct bpf_program *src_prog, const struct bpf_program *chk_prog, const struct netns_setup_result *setup_result) { - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); int err; - /* tc qdisc add dev veth_src_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); - /* tc filter add dev veth_src_fwd ingress bpf da src_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, src_prog, 0); - /* tc filter add dev veth_src_fwd egress bpf da chk_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, chk_prog, 0); + /* tc qdisc add dev src_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd); + /* tc filter add dev src_fwd ingress bpf da src_prog */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, src_prog, 0); + /* tc filter add dev src_fwd egress bpf da chk_prog */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, chk_prog, 0); - /* tc qdisc add dev veth_dst_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); - /* tc filter add dev veth_dst_fwd ingress bpf da dst_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, dst_prog, 0); - /* tc filter add dev veth_dst_fwd egress bpf da chk_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, chk_prog, 0); + /* tc qdisc add dev dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); + /* tc filter add dev dst_fwd ingress bpf da dst_prog */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, dst_prog, 0); + /* tc filter add dev dst_fwd egress bpf da chk_prog */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, chk_prog, 0); return 0; fail: @@ -539,10 +550,10 @@ done: static int netns_load_dtime_bpf(struct test_tc_dtime *skel, const struct netns_setup_result *setup_result) { - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst); + LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_src); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst); struct nstoken *nstoken; int err; @@ -550,58 +561,58 @@ static int netns_load_dtime_bpf(struct test_tc_dtime *skel, nstoken = open_netns(NS_SRC); if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC)) return -1; - /* tc qdisc add dev veth_src clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_src, setup_result->ifindex_veth_src); - /* tc filter add dev veth_src ingress bpf da ingress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0); - /* tc filter add dev veth_src egress bpf da egress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_EGRESS, skel->progs.egress_host, 0); + /* tc qdisc add dev src clsact */ + QDISC_CLSACT_CREATE(&qdisc_src, setup_result->ifindex_src); + /* tc filter add dev src ingress bpf da ingress_host */ + XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0); + /* tc filter add dev src egress bpf da egress_host */ + XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_EGRESS, skel->progs.egress_host, 0); close_netns(nstoken); /* setup ns_dst tc progs */ nstoken = open_netns(NS_DST); if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST)) return -1; - /* tc qdisc add dev veth_dst clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst, setup_result->ifindex_veth_dst); - /* tc filter add dev veth_dst ingress bpf da ingress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0); - /* tc filter add dev veth_dst egress bpf da egress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0); + /* tc qdisc add dev dst clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst, setup_result->ifindex_dst); + /* tc filter add dev dst ingress bpf da ingress_host */ + XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0); + /* tc filter add dev dst egress bpf da egress_host */ + XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0); close_netns(nstoken); /* setup ns_fwd tc progs */ nstoken = open_netns(NS_FWD); if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD)) return -1; - /* tc qdisc add dev veth_dst_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); - /* tc filter add dev veth_dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, + /* tc qdisc add dev dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); + /* tc filter add dev dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio100, 100); - /* tc filter add dev veth_dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, + /* tc filter add dev dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio101, 101); - /* tc filter add dev veth_dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, + /* tc filter add dev dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio100, 100); - /* tc filter add dev veth_dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, + /* tc filter add dev dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio101, 101); - /* tc qdisc add dev veth_src_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); - /* tc filter add dev veth_src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, + /* tc qdisc add dev src_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd); + /* tc filter add dev src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio100, 100); - /* tc filter add dev veth_src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, + /* tc filter add dev src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio101, 101); - /* tc filter add dev veth_src_fwd egress prio 100 bpf da egress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, + /* tc filter add dev src_fwd egress prio 100 bpf da egress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio100, 100); - /* tc filter add dev veth_src_fwd egress prio 101 bpf da egress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, + /* tc filter add dev src_fwd egress prio 101 bpf da egress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio101, 101); close_netns(nstoken); return 0; @@ -777,8 +788,8 @@ static void test_tc_redirect_dtime(struct netns_setup_result *setup_result) if (!ASSERT_OK_PTR(skel, "test_tc_dtime__open")) return; - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_dtime__load(skel); if (!ASSERT_OK(err, "test_tc_dtime__load")) @@ -868,8 +879,8 @@ static void test_tc_redirect_neigh(struct netns_setup_result *setup_result) if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open")) goto done; - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_neigh__load(skel); if (!ASSERT_OK(err, "test_tc_neigh__load")) @@ -904,8 +915,8 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result) if (!ASSERT_OK_PTR(skel, "test_tc_peer__open")) goto done; - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_peer__load(skel); if (!ASSERT_OK(err, "test_tc_peer__load")) @@ -996,7 +1007,7 @@ static int tun_relay_loop(int src_fd, int target_fd) static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) { LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); struct test_tc_peer *skel = NULL; struct nstoken *nstoken = NULL; int err; @@ -1045,7 +1056,7 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) goto fail; skel->rodata->IFINDEX_SRC = ifindex; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_peer__load(skel); if (!ASSERT_OK(err, "test_tc_peer__load")) @@ -1053,19 +1064,19 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) /* Load "tc_src_l3" to the tun_fwd interface to redirect packets * towards dst, and "tc_dst" to redirect packets - * and "tc_chk" on veth_dst_fwd to drop non-redirected packets. + * and "tc_chk" on dst_fwd to drop non-redirected packets. */ /* tc qdisc add dev tun_fwd clsact */ QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex); /* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */ XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0); - /* tc qdisc add dev veth_dst_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); - /* tc filter add dev veth_dst_fwd ingress bpf da tc_dst_l3 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0); - /* tc filter add dev veth_dst_fwd egress bpf da tc_chk */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0); + /* tc qdisc add dev dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); + /* tc filter add dev dst_fwd ingress bpf da tc_dst_l3 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0); + /* tc filter add dev dst_fwd egress bpf da tc_chk */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0); /* Setup route and neigh tables */ SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24"); @@ -1074,17 +1085,17 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad"); SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad"); - SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev veth_src scope global"); + SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev src scope global"); SYS(fail, "ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD " dev tun_src scope global"); - SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev veth_dst scope global"); - SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev veth_src scope global"); + SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev dst scope global"); + SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev src scope global"); SYS(fail, "ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD " dev tun_src scope global"); - SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev veth_dst scope global"); + SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev dst scope global"); - SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); - SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); + SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev dst lladdr " MAC_DST_FWD); + SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev dst lladdr " MAC_DST_FWD); if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) goto fail; @@ -1106,9 +1117,9 @@ fail: close_netns(nstoken); } -#define RUN_TEST(name) \ +#define RUN_TEST(name, mode) \ ({ \ - struct netns_setup_result setup_result; \ + struct netns_setup_result setup_result = { .dev_mode = mode, }; \ if (test__start_subtest(#name)) \ if (ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) { \ if (ASSERT_OK(netns_setup_links_and_routes(&setup_result), \ @@ -1122,11 +1133,11 @@ static void *test_tc_redirect_run_tests(void *arg) { netns_setup_namespaces_nofail("delete"); - RUN_TEST(tc_redirect_peer); - RUN_TEST(tc_redirect_peer_l3); - RUN_TEST(tc_redirect_neigh); - RUN_TEST(tc_redirect_neigh_fib); - RUN_TEST(tc_redirect_dtime); + RUN_TEST(tc_redirect_peer, MODE_VETH); + RUN_TEST(tc_redirect_peer_l3, MODE_VETH); + RUN_TEST(tc_redirect_neigh, MODE_VETH); + RUN_TEST(tc_redirect_neigh_fib, MODE_VETH); + RUN_TEST(tc_redirect_dtime, MODE_VETH); return NULL; } -- cgit From adfeae2d243d9e5b83d094af481d189156b11779 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:20 +0100 Subject: selftests/bpf: Add netkit to tc_redirect selftest Extend the existing tc_redirect selftest to also cover netkit devices for exercising the bpf_redirect_peer() code paths, so that we have both veth as well as netkit covered, all tests still pass after this change. Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-9-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 52 ++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 407ff4e9bc78..518f143c5b0f 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -24,6 +24,7 @@ #include "test_progs.h" #include "network_helpers.h" +#include "netlink_helpers.h" #include "test_tc_neigh_fib.skel.h" #include "test_tc_neigh.skel.h" #include "test_tc_peer.skel.h" @@ -112,6 +113,7 @@ static void netns_setup_namespaces_nofail(const char *verb) enum dev_mode { MODE_VETH, + MODE_NETKIT, }; struct netns_setup_result { @@ -142,10 +144,51 @@ static int get_ifaddr(const char *name, char *ifaddr) return 0; } +static int create_netkit(int mode, char *prim, char *peer) +{ + struct rtattr *linkinfo, *data, *peer_info; + struct rtnl_handle rth = { .fd = -1 }; + const char *type = "netkit"; + struct { + struct nlmsghdr n; + struct ifinfomsg i; + char buf[1024]; + } req = {}; + int err; + + err = rtnl_open(&rth, 0); + if (!ASSERT_OK(err, "open_rtnetlink")) + return err; + + memset(&req, 0, sizeof(req)); + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + req.n.nlmsg_type = RTM_NEWLINK; + req.i.ifi_family = AF_UNSPEC; + + addattr_l(&req.n, sizeof(req), IFLA_IFNAME, prim, strlen(prim)); + linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO); + addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type)); + data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA); + addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); + peer_info = addattr_nest(&req.n, sizeof(req), IFLA_NETKIT_PEER_INFO); + req.n.nlmsg_len += sizeof(struct ifinfomsg); + addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer)); + addattr_nest_end(&req.n, peer_info); + addattr_nest_end(&req.n, data); + addattr_nest_end(&req.n, linkinfo); + + err = rtnl_talk(&rth, &req.n, NULL); + ASSERT_OK(err, "talk_rtnetlink"); + rtnl_close(&rth); + return err; +} + static int netns_setup_links_and_routes(struct netns_setup_result *result) { struct nstoken *nstoken = NULL; char src_fwd_addr[IFADDR_STR_LEN+1] = {}; + int err; if (result->dev_mode == MODE_VETH) { SYS(fail, "ip link add src type veth peer name src_fwd"); @@ -153,6 +196,13 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD); SYS(fail, "ip link set dst address " MAC_DST); + } else if (result->dev_mode == MODE_NETKIT) { + err = create_netkit(NETKIT_L3, "src", "src_fwd"); + if (!ASSERT_OK(err, "create_ifindex_src")) + goto fail; + err = create_netkit(NETKIT_L3, "dst", "dst_fwd"); + if (!ASSERT_OK(err, "create_ifindex_dst")) + goto fail; } if (get_ifaddr("src_fwd", src_fwd_addr)) @@ -1134,7 +1184,9 @@ static void *test_tc_redirect_run_tests(void *arg) netns_setup_namespaces_nofail("delete"); RUN_TEST(tc_redirect_peer, MODE_VETH); + RUN_TEST(tc_redirect_peer, MODE_NETKIT); RUN_TEST(tc_redirect_peer_l3, MODE_VETH); + RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT); RUN_TEST(tc_redirect_neigh, MODE_VETH); RUN_TEST(tc_redirect_neigh_fib, MODE_VETH); RUN_TEST(tc_redirect_dtime, MODE_VETH); -- cgit From 977bc146d4eb7070118d8a974919b33bb52732b4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:51 +0200 Subject: selftests/bpf: track tcp payload offset as scalar in xdp_synproxy This change prepares syncookie_{tc,xdp} for update in callbakcs verification logic. To allow bpf_loop() verification converge when multiple callback itreations are considered: - track offset inside TCP payload explicitly, not as a part of the pointer; - make sure that offset does not exceed MAX_PACKET_OFF enforced by verifier; - make sure that offset is tracked as unbound scalar between iterations, otherwise verifier won't be able infer that bpf_loop callback reaches identical states. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/xdp_synproxy_kern.c | 84 +++++++++++++--------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index e959336c7a73..80f620602d50 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -53,6 +53,8 @@ #define DEFAULT_TTL 64 #define MAX_ALLOWED_PORTS 8 +#define MAX_PACKET_OFF 0xffff + #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) @@ -183,63 +185,76 @@ static __always_inline __u32 tcp_clock_ms(void) } struct tcpopt_context { - __u8 *ptr; - __u8 *end; + void *data; void *data_end; __be32 *tsecr; __u8 wscale; bool option_timestamp; bool option_sack; + __u32 off; }; -static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz) { - __u8 opcode, opsize; + __u64 off = ctx->off; + __u8 *data; - if (ctx->ptr >= ctx->end) - return 1; - if (ctx->ptr >= ctx->data_end) - return 1; + /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ + if (off > MAX_PACKET_OFF - sz) + return NULL; - opcode = ctx->ptr[0]; + data = ctx->data + off; + barrier_var(data); + if (data + sz >= ctx->data_end) + return NULL; - if (opcode == TCPOPT_EOL) - return 1; - if (opcode == TCPOPT_NOP) { - ++ctx->ptr; - return 0; - } + ctx->off += sz; + return data; +} - if (ctx->ptr + 1 >= ctx->end) - return 1; - if (ctx->ptr + 1 >= ctx->data_end) +static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +{ + __u8 *opcode, *opsize, *wscale, *tsecr; + __u32 off = ctx->off; + + opcode = next(ctx, 1); + if (!opcode) return 1; - opsize = ctx->ptr[1]; - if (opsize < 2) + + if (*opcode == TCPOPT_EOL) return 1; + if (*opcode == TCPOPT_NOP) + return 0; - if (ctx->ptr + opsize > ctx->end) + opsize = next(ctx, 1); + if (!opsize || *opsize < 2) return 1; - switch (opcode) { + switch (*opcode) { case TCPOPT_WINDOW: - if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) - ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; + wscale = next(ctx, 1); + if (!wscale) + return 1; + if (*opsize == TCPOLEN_WINDOW) + ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE; break; case TCPOPT_TIMESTAMP: - if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { + tsecr = next(ctx, 4); + if (!tsecr) + return 1; + if (*opsize == TCPOLEN_TIMESTAMP) { ctx->option_timestamp = true; /* Client's tsval becomes our tsecr. */ - *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); + *ctx->tsecr = get_unaligned((__be32 *)tsecr); } break; case TCPOPT_SACK_PERM: - if (opsize == TCPOLEN_SACK_PERM) + if (*opsize == TCPOLEN_SACK_PERM) ctx->option_sack = true; break; } - ctx->ptr += opsize; + ctx->off = off + *opsize; return 0; } @@ -256,16 +271,21 @@ static int tscookie_tcpopt_parse_batch(__u32 index, void *context) static __always_inline bool tscookie_init(struct tcphdr *tcp_header, __u16 tcp_len, __be32 *tsval, - __be32 *tsecr, void *data_end) + __be32 *tsecr, void *data, void *data_end) { struct tcpopt_context loop_ctx = { - .ptr = (__u8 *)(tcp_header + 1), - .end = (__u8 *)tcp_header + tcp_len, + .data = data, .data_end = data_end, .tsecr = tsecr, .wscale = TS_OPT_WSCALE_MASK, .option_timestamp = false, .option_sack = false, + /* Note: currently verifier would track .off as unbound scalar. + * In case if verifier would at some point get smarter and + * compute bounded value for this var, beware that it might + * hinder bpf_loop() convergence validation. + */ + .off = (__u8 *)(tcp_header + 1) - (__u8 *)data, }; u32 cookie; @@ -635,7 +655,7 @@ static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, cookie = (__u32)value; if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, - &tsopt_buf[0], &tsopt_buf[1], data_end)) + &tsopt_buf[0], &tsopt_buf[1], data, data_end)) tsopt = tsopt_buf; /* Check that there is enough space for a SYNACK. It also covers -- cgit From 87eb0152bcc102ecbda866978f4e54db5a3be1ef Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:52 +0200 Subject: selftests/bpf: track string payload offset as scalar in strobemeta This change prepares strobemeta for update in callbacks verification logic. To allow bpf_loop() verification converge when multiple callback iterations are considered: - track offset inside strobemeta_payload->payload directly as scalar value; - at each iteration make sure that remaining strobemeta_payload->payload capacity is sufficient for execution of read_{map,str}_var functions; - make sure that offset is tracked as unbound scalar between iterations, otherwise verifier won't be able infer that bpf_loop callback reaches identical states. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/strobemeta.h | 78 ++++++++++++++++---------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index e02cfd380746..40df2cc26eaf 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -24,9 +24,11 @@ struct task_struct {}; #define STACK_TABLE_EPOCH_SHIFT 20 #define STROBE_MAX_STR_LEN 1 #define STROBE_MAX_CFGS 32 +#define READ_MAP_VAR_PAYLOAD_CAP \ + ((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) #define STROBE_MAX_PAYLOAD \ (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ - STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) + STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP) struct strobe_value_header { /* @@ -355,7 +357,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base, struct strobe_value_generic *value, struct strobemeta_payload *data, - void *payload) + size_t off) { void *location; uint64_t len; @@ -366,7 +368,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, return 0; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr); + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); /* * if bpf_probe_read_user_str returns error (<0), due to casting to * unsinged int, it will become big number, so next check is @@ -378,14 +380,14 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, return 0; data->str_lens[idx] = len; - return len; + return off + len; } -static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, - size_t idx, void *tls_base, - struct strobe_value_generic *value, - struct strobemeta_payload *data, - void *payload) +static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg, + size_t idx, void *tls_base, + struct strobe_value_generic *value, + struct strobemeta_payload *data, + size_t off) { struct strobe_map_descr* descr = &data->map_descrs[idx]; struct strobe_map_raw map; @@ -397,11 +399,11 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, location = calc_location(&cfg->map_locs[idx], tls_base); if (!location) - return payload; + return off; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) - return payload; + return off; descr->id = map.id; descr->cnt = map.cnt; @@ -410,10 +412,10 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, data->req_meta_valid = 1; } - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag); + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag); if (len <= STROBE_MAX_STR_LEN) { descr->tag_len = len; - payload += len; + off += len; } #ifdef NO_UNROLL @@ -426,22 +428,22 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, break; descr->key_lens[i] = 0; - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.entries[i].key); if (len <= STROBE_MAX_STR_LEN) { descr->key_lens[i] = len; - payload += len; + off += len; } descr->val_lens[i] = 0; - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.entries[i].val); if (len <= STROBE_MAX_STR_LEN) { descr->val_lens[i] = len; - payload += len; + off += len; } } - return payload; + return off; } #ifdef USE_BPF_LOOP @@ -455,14 +457,20 @@ struct read_var_ctx { struct strobemeta_payload *data; void *tls_base; struct strobemeta_cfg *cfg; - void *payload; + size_t payload_off; /* value gets mutated */ struct strobe_value_generic *value; enum read_type type; }; -static int read_var_callback(__u32 index, struct read_var_ctx *ctx) +static int read_var_callback(__u64 index, struct read_var_ctx *ctx) { + /* lose precision info for ctx->payload_off, verifier won't track + * double xor, barrier_var() is needed to force clang keep both xors. + */ + ctx->payload_off ^= index; + barrier_var(ctx->payload_off); + ctx->payload_off ^= index; switch (ctx->type) { case READ_INT_VAR: if (index >= STROBE_MAX_INTS) @@ -472,14 +480,18 @@ static int read_var_callback(__u32 index, struct read_var_ctx *ctx) case READ_MAP_VAR: if (index >= STROBE_MAX_MAPS) return 1; - ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base, - ctx->value, ctx->data, ctx->payload); + if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP) + return 1; + ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base, + ctx->value, ctx->data, ctx->payload_off); break; case READ_STR_VAR: if (index >= STROBE_MAX_STRS) return 1; - ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base, - ctx->value, ctx->data, ctx->payload); + if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN) + return 1; + ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base, + ctx->value, ctx->data, ctx->payload_off); break; } return 0; @@ -501,7 +513,8 @@ static void *read_strobe_meta(struct task_struct *task, pid_t pid = bpf_get_current_pid_tgid() >> 32; struct strobe_value_generic value = {0}; struct strobemeta_cfg *cfg; - void *tls_base, *payload; + size_t payload_off; + void *tls_base; cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); if (!cfg) @@ -509,7 +522,7 @@ static void *read_strobe_meta(struct task_struct *task, data->int_vals_set_mask = 0; data->req_meta_valid = 0; - payload = data->payload; + payload_off = 0; /* * we don't have struct task_struct definition, it should be: * tls_base = (void *)task->thread.fsbase; @@ -522,7 +535,7 @@ static void *read_strobe_meta(struct task_struct *task, .tls_base = tls_base, .value = &value, .data = data, - .payload = payload, + .payload_off = 0, }; int err; @@ -540,6 +553,11 @@ static void *read_strobe_meta(struct task_struct *task, err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); if (err != STROBE_MAX_MAPS) return NULL; + + payload_off = ctx.payload_off; + /* this should not really happen, here only to satisfy verifer */ + if (payload_off > sizeof(data->payload)) + payload_off = sizeof(data->payload); #else #ifdef NO_UNROLL #pragma clang loop unroll(disable) @@ -555,7 +573,7 @@ static void *read_strobe_meta(struct task_struct *task, #pragma unroll #endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_STRS; ++i) { - payload += read_str_var(cfg, i, tls_base, &value, data, payload); + payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off); } #ifdef NO_UNROLL #pragma clang loop unroll(disable) @@ -563,7 +581,7 @@ static void *read_strobe_meta(struct task_struct *task, #pragma unroll #endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_MAPS; ++i) { - payload = read_map_var(cfg, i, tls_base, &value, data, payload); + payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off); } #endif /* USE_BPF_LOOP */ @@ -571,7 +589,7 @@ static void *read_strobe_meta(struct task_struct *task, * return pointer right after end of payload, so it's possible to * calculate exact amount of useful data that needs to be sent */ - return payload; + return &data->payload[payload_off]; } SEC("raw_tracepoint/kfree_skb") -- cgit From f40bfd1679446b22d321e64a1fa98b7d07d2be08 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:53 +0200 Subject: selftests/bpf: fix bpf_loop_bench for new callback verification scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a preparatory change. A follow-up patch "bpf: verify callbacks as if they are called unknown number of times" changes logic for callbacks handling. While previously callbacks were verified as a single function call, new scheme takes into account that callbacks could be executed unknown number of times. This has dire implications for bpf_loop_bench: SEC("fentry/" SYS_PREFIX "sys_getpgid") int benchmark(void *ctx) { for (int i = 0; i < 1000; i++) { bpf_loop(nr_loops, empty_callback, NULL, 0); __sync_add_and_fetch(&hits, nr_loops); } return 0; } W/o callbacks change verifier sees it as a 1000 calls to empty_callback(). However, with callbacks change things become exponential: - i=0: state exploring empty_callback is scheduled with i=0 (a); - i=1: state exploring empty_callback is scheduled with i=1; ... - i=999: state exploring empty_callback is scheduled with i=999; - state (a) is popped from stack; - i=1: state exploring empty_callback is scheduled with i=1; ... Avoid this issue by rewriting outer loop as bpf_loop(). Unfortunately, this adds a function call to a loop at runtime, which negatively affects performance: throughput latency before: 149.919 ± 0.168 M ops/s, 6.670 ns/op after : 137.040 ± 0.187 M ops/s, 7.297 ns/op Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_loop_bench.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_loop_bench.c b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c index 4ce76eb064c4..d461746fd3c1 100644 --- a/tools/testing/selftests/bpf/progs/bpf_loop_bench.c +++ b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c @@ -15,13 +15,16 @@ static int empty_callback(__u32 index, void *data) return 0; } +static int outer_loop(__u32 index, void *data) +{ + bpf_loop(nr_loops, empty_callback, NULL, 0); + __sync_add_and_fetch(&hits, nr_loops); + return 0; +} + SEC("fentry/" SYS_PREFIX "sys_getpgid") int benchmark(void *ctx) { - for (int i = 0; i < 1000; i++) { - bpf_loop(nr_loops, empty_callback, NULL, 0); - - __sync_add_and_fetch(&hits, nr_loops); - } + bpf_loop(1000, outer_loop, NULL, 0); return 0; } -- cgit From 683b96f9606ab7308ffb23c46ab43cecdef8a241 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:54 +0200 Subject: bpf: extract __check_reg_arg() utility function Split check_reg_arg() into two utility functions: - check_reg_arg() operating on registers from current verifier state; - __check_reg_arg() operating on a specific set of registers passed as a parameter; The __check_reg_arg() function would be used by a follow-up change for callbacks handling. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6da370a047fe..e6e1bcfe00f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3439,13 +3439,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env, reg->subreg_def = DEF_NOT_SUBREG; } -static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, - enum reg_arg_type t) +static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, + enum reg_arg_type t) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *reg; bool rw64; if (regno >= MAX_BPF_REG) { @@ -3486,6 +3484,15 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, + enum reg_arg_type t) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return __check_reg_arg(env, state->regs, regno, t); +} + static void mark_jmp_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].jmp_point = true; @@ -9350,7 +9357,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK); } } -- cgit From 58124a98cb8eda69d248d7f1de954c8b2767c945 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:55 +0200 Subject: bpf: extract setup_func_entry() utility function Move code for simulated stack frame creation to a separate utility function. This function would be used in the follow-up change for callbacks handling. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 84 +++++++++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6e1bcfe00f5..68ee4803d3a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9370,11 +9370,10 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); -static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx, int subprog, - set_callee_state_fn set_callee_state_cb) +static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite, + set_callee_state_fn set_callee_state_cb, + struct bpf_verifier_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; int err; @@ -9384,13 +9383,53 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -E2BIG; } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", state->curframe + 1); return -EFAULT; } + caller = state->frame[state->curframe]; + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + callsite, + state->curframe + 1 /* frameno within this callchain */, + subprog /* subprog number within this prog */); + /* Transfer references to the callee */ + err = copy_reference_state(callee, caller); + err = err ?: set_callee_state_cb(env, caller, callee, callsite); + if (err) + goto err_out; + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; +} + +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int err; + + caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; @@ -9460,35 +9499,12 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return 0; } - callee = kzalloc(sizeof(*callee), GFP_KERNEL); - if (!callee) - return -ENOMEM; - state->frame[state->curframe + 1] = callee; - - /* callee cannot access r0, r6 - r9 for reading and has to write - * into its own stack before reading from it. - * callee can read/write into caller's stack - */ - init_func_state(env, callee, - /* remember the callsite, it will be used by bpf_exit */ - *insn_idx /* callsite */, - state->curframe + 1 /* frameno within this callchain */, - subprog /* subprog number within this prog */); - - /* Transfer references to the callee */ - err = copy_reference_state(callee, caller); + err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state); if (err) - goto err_out; - - err = set_callee_state_cb(env, caller, callee, *insn_idx); - if (err) - goto err_out; + return err; clear_caller_saved_regs(env, caller->regs); - /* only increment it after check_reg_arg() finished */ - state->curframe++; - /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; @@ -9496,14 +9512,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "caller:\n"); print_verifier_state(env, caller, true); verbose(env, "callee:\n"); - print_verifier_state(env, callee, true); + print_verifier_state(env, state->frame[state->curframe], true); } - return 0; -err_out: - free_func_state(callee); - state->frame[state->curframe + 1] = NULL; - return err; + return 0; } int map_set_for_each_callback_args(struct bpf_verifier_env *env, -- cgit From ab5cfac139ab8576fb54630d4cca23c3e690ee90 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:56 +0200 Subject: bpf: verify callbacks as if they are called unknown number of times Prior to this patch callbacks were handled as regular function calls, execution of callback body was modeled exactly once. This patch updates callbacks handling logic as follows: - introduces a function push_callback_call() that schedules callback body verification in env->head stack; - updates prepare_func_exit() to reschedule callback body verification upon BPF_EXIT; - as calls to bpf_*_iter_next(), calls to callback invoking functions are marked as checkpoints; - is_state_visited() is updated to stop callback based iteration when some identical parent state is found. Paths with callback function invoked zero times are now verified first, which leads to necessity to modify some selftests: - the following negative tests required adding release/unlock/drop calls to avoid previously masked unrelated error reports: - cb_refs.c:underflow_prog - exceptions_fail.c:reject_rbtree_add_throw - exceptions_fail.c:reject_with_cp_reference - the following precision tracking selftests needed change in expected log trace: - verifier_subprog_precision.c:callback_result_precise (note: r0 precision is no longer propagated inside callback and I think this is a correct behavior) - verifier_subprog_precision.c:parent_callee_saved_reg_precise_with_callback - verifier_subprog_precision.c:parent_stack_slot_precise_with_callback Reported-by: Andrew Werner Closes: https://lore.kernel.org/bpf/CA+vRuzPChFNXmouzGG+wsy=6eMcfr1mFG0F3g7rbg-sedGKW3w@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 + kernel/bpf/verifier.c | 274 +++++++++++++-------- tools/testing/selftests/bpf/progs/cb_refs.c | 1 + .../testing/selftests/bpf/progs/exceptions_fail.c | 2 + .../bpf/progs/verifier_subprog_precision.c | 71 ++++-- 5 files changed, 240 insertions(+), 113 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 24213a99cc79..dd326936dd6f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -400,6 +400,7 @@ struct bpf_verifier_state { struct bpf_idx_pair *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; + u32 callback_unroll_depth; }; #define bpf_get_spilled_reg(slot, frame, mask) \ @@ -511,6 +512,10 @@ struct bpf_insn_aux_data { * this instruction, regardless of any heuristics */ bool force_checkpoint; + /* true if instruction is a call to a helper function that + * accepts callback function as a parameter. + */ + bool calls_callback; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68ee4803d3a2..a60dfa56ebb3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -547,13 +547,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } -static bool is_callback_calling_kfunc(u32 btf_id); +static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_callback_calling_function(enum bpf_func_id func_id) +static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || - func_id == BPF_FUNC_timer_set_callback || func_id == BPF_FUNC_find_vma || func_id == BPF_FUNC_loop || func_id == BPF_FUNC_user_ringbuf_drain; @@ -564,6 +563,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_timer_set_callback; } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return is_sync_callback_calling_function(func_id) || + is_async_callback_calling_function(func_id); +} + +static bool is_sync_callback_calling_insn(struct bpf_insn *insn) +{ + return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -1808,6 +1819,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; dst_state->dfs_depth = src->dfs_depth; + dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -3731,6 +3743,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -3906,16 +3920,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return -EFAULT; return 0; } - } else if ((bpf_helper_call(insn) && - is_callback_calling_function(insn->imm) && - !is_async_callback_calling_function(insn->imm)) || - (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) { - /* callback-calling helper or kfunc call, which means - * we are exiting from subprog, but unlike the subprog - * call handling above, we shouldn't propagate - * precision of r1-r5 (if any requested), as they are - * not actually arguments passed directly to callback - * subprogs + } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { + /* exit from callback subprog to callback-calling helper or + * kfunc call. Use idx/subseq_idx check to discern it from + * straight line code backtracking. + * Unlike the subprog call handling above, we shouldn't + * propagate precision of r1-r5 (if any requested), as they are + * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); @@ -3950,10 +3961,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, } else if (opcode == BPF_EXIT) { bool r0_precise; + /* Backtracking to a nested function call, 'idx' is a part of + * the inner frame 'subseq_idx' is a part of the outer frame. + * In case of a regular function call, instructions giving + * precision to registers R1-R5 should have been found already. + * In case of a callback, it is ok to have R1-R5 marked for + * backtracking, as these registers are set by the function + * invoking callback. + */ + if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 - * they should have been found already. - */ verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; @@ -9421,11 +9440,11 @@ err_out: return err; } -static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx, int subprog, - set_callee_state_fn set_callee_state_cb) +static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *state = env->cur_state, *callback_state; struct bpf_func_state *caller, *callee; int err; @@ -9433,44 +9452,22 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; - if (subprog_is_global(env, subprog)) { - if (err) { - verbose(env, "Caller passes invalid args into func#%d\n", - subprog); - return err; - } else { - if (env->log.level & BPF_LOG_LEVEL) - verbose(env, - "Func#%d is global and valid. Skipping.\n", - subprog); - clear_caller_saved_regs(env, caller->regs); - - /* All global functions return a 64-bit SCALAR_VALUE */ - mark_reg_unknown(env, caller->regs, BPF_REG_0); - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; - - /* continue with next insn after call */ - return 0; - } - } /* set_callee_state is used for direct subprog calls, but we are * interested in validating only BPF helpers that can call subprogs as * callbacks */ - if (set_callee_state_cb != set_callee_state) { - env->subprog_info[subprog].is_cb = true; - if (bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } else if (!bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } + env->subprog_info[subprog].is_cb = true; + if (bpf_pseudo_kfunc_call(insn) && + !is_sync_callback_calling_kfunc(insn->imm)) { + verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } else if (!bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_function(insn->imm)) { /* helper */ + verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; } if (insn->code == (BPF_JMP | BPF_CALL) && @@ -9481,25 +9478,76 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* there is no real recursion here. timer callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - *insn_idx, subprog); + insn_idx, subprog); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; /* Convert bpf_timer_set_callback() args into timer callback args */ - err = set_callee_state_cb(env, caller, callee, *insn_idx); + err = set_callee_state_cb(env, caller, callee, insn_idx); if (err) return err; + return 0; + } + + /* for callback functions enqueue entry to callback and + * proceed with next instruction within current frame. + */ + callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); + if (!callback_state) + return -ENOMEM; + + err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, + callback_state); + if (err) + return err; + + callback_state->callback_unroll_depth++; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller; + int err, subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + err = btf_check_subprog_call(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (subprog_is_global(env, subprog)) { + if (err) { + verbose(env, "Caller passes invalid args into func#%d\n", subprog); + return err; + } + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d is global and valid. Skipping.\n", subprog); clear_caller_saved_regs(env, caller->regs); + + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* continue with next insn after call */ return 0; } - err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state); + /* for regular function entry setup new frame and continue + * from that frame. + */ + err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state); if (err) return err; @@ -9559,22 +9607,6 @@ static int set_callee_state(struct bpf_verifier_env *env, return 0; } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) -{ - int subprog, target_insn; - - target_insn = *insn_idx + insn->imm + 1; - subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn); - return -EFAULT; - } - - return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); -} - static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -9798,6 +9830,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); return -EINVAL; } + if (!calls_callback(env, callee->callsite)) { + verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", + *insn_idx, callee->callsite); + return -EFAULT; + } } else { /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; @@ -9815,7 +9852,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; } - *insn_idx = callee->callsite + 1; + /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, + * there function call logic would reschedule callback visit. If iteration + * converges is_state_visited() would prune that visit eventually. + */ + if (callee->in_callback_fn) + *insn_idx = callee->callsite; + else + *insn_idx = callee->callsite + 1; + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee, true); @@ -10228,24 +10273,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_for_each_map_elem: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_map_elem_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_map_elem_callback_state); break; case BPF_FUNC_timer_set_callback: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_timer_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); break; case BPF_FUNC_find_vma: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_find_vma_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_find_vma_callback_state); break; case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: update_loop_inline_state(env, meta.subprogno); - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_loop_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_loop_callback_state); break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { @@ -10341,8 +10386,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; } case BPF_FUNC_user_ringbuf_drain: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_user_ringbuf_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_user_ringbuf_callback_state); break; } @@ -11230,7 +11275,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } -static bool is_callback_calling_kfunc(u32 btf_id) +static bool is_sync_callback_calling_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } @@ -11982,6 +12027,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Check the arguments */ + err = check_kfunc_args(env, &meta, insn_idx); + if (err < 0) + return err; + + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_rbtree_add_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); @@ -12017,10 +12077,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } - /* Check the arguments */ - err = check_kfunc_args(env, &meta, insn_idx); - if (err < 0) - return err; /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -12054,16 +12110,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_rbtree_add_callback_state); - if (err) { - verbose(env, "kfunc %s#%d failed callback verification\n", - func_name, meta.func_id); - return err; - } - } - if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { if (!bpf_jit_supports_exceptions()) { verbose(env, "JIT does not support calling kfunc %s#%d\n", @@ -15427,6 +15473,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].force_checkpoint; } +static void mark_calls_callback(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].calls_callback = true; +} + +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].calls_callback; +} enum { DONE_EXPLORING = 0, @@ -15540,6 +15595,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); + /* For functions that invoke callbacks it is not known how many times + * callback would be called. Verifier models callback calling functions + * by repeatedly visiting callback bodies and returning to origin call + * instruction. + * In order to stop such iteration verifier needs to identify when a + * state identical some state from a previous iteration is reached. + * Check below forces creation of checkpoint before callback calling + * instruction to allow search for such identical states. + */ + if (is_sync_callback_calling_insn(insn)) { + mark_calls_callback(env, t); + mark_force_checkpoint(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -17009,10 +17079,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } goto skip_inf_loop_check; } + if (calls_callback(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, true)) + goto hit; + goto skip_inf_loop_check; + } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && states_equal(env, &sl->state, cur, false) && - !iter_active_depths_differ(&sl->state, cur)) { + !iter_active_depths_differ(&sl->state, cur) && + sl->state.callback_unroll_depth == cur->callback_unroll_depth) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); verbose(env, "cur state:"); diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c index 76d661b20e87..56c764df8196 100644 --- a/tools/testing/selftests/bpf/progs/cb_refs.c +++ b/tools/testing/selftests/bpf/progs/cb_refs.c @@ -33,6 +33,7 @@ int underflow_prog(void *ctx) if (!p) return 0; bpf_for_each_map_elem(&array_map, cb1, &p, 0); + bpf_kfunc_call_test_release(p); return 0; } diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c index 4c39e920dac2..8c0ef2742208 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_fail.c +++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c @@ -171,6 +171,7 @@ int reject_with_rbtree_add_throw(void *ctx) return 0; bpf_spin_lock(&lock); bpf_rbtree_add(&rbtree, &f->node, rbless); + bpf_spin_unlock(&lock); return 0; } @@ -214,6 +215,7 @@ int reject_with_cb_reference(void *ctx) if (!f) return 0; bpf_loop(5, subprog_cb_ref, NULL, 0); + bpf_obj_drop(f); return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index db6b3143338b..da803cffb5ef 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -119,15 +119,26 @@ __naked int global_subprog_result_precise(void) SEC("?raw_tp") __success __log_level(2) +/* First simulated path does not include callback body */ __msg("14: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 14 first_idx 10") +__msg("mark_precise: frame0: last_idx 14 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4") __msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0") -__msg("mark_precise: frame0: parent state regs=r0 stack=:") -__msg("mark_precise: frame0: last_idx 18 first_idx 0") -__msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit") +__msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop") +/* State entering callback body popped from states stack */ +__msg("from 9 to 17: frame1:") +__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb") +__msg("17: (b7) r0 = 0") +__msg("18: (95) exit") +__msg("returning from callee:") +__msg("to caller at 9:") +/* r4 (flags) is always precise for bpf_loop() */ +__msg("frame 0: propagating r4") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit") +__msg("from 18 to 9: safe") __naked int callback_result_precise(void) { asm volatile ( @@ -233,20 +244,36 @@ __naked int parent_callee_saved_reg_precise_global(void) SEC("?raw_tp") __success __log_level(2) +/* First simulated path does not include callback body */ __msg("12: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 12 first_idx 10") +__msg("mark_precise: frame0: last_idx 12 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4") +__msg("mark_precise: frame0: regs=r6 stack= before 9: (85) call bpf_loop") __msg("mark_precise: frame0: parent state regs=r6 stack=:") -__msg("mark_precise: frame0: last_idx 16 first_idx 0") -__msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit") -__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") -__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181") +__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0") __msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0") __msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8") __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1") __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") +/* State entering callback body popped from states stack */ +__msg("from 9 to 15: frame1:") +__msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb") +__msg("15: (b7) r0 = 0") +__msg("16: (95) exit") +__msg("returning from callee:") +__msg("to caller at 9:") +/* r4 (flags) is always precise for bpf_loop(), + * r6 was marked before backtracking to callback body. + */ +__msg("frame 0: propagating r4,r6") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit") +__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") +__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop") +__msg("mark_precise: frame0: parent state regs= stack=:") +__msg("from 16 to 9: safe") __naked int parent_callee_saved_reg_precise_with_callback(void) { asm volatile ( @@ -373,22 +400,38 @@ __naked int parent_stack_slot_precise_global(void) SEC("?raw_tp") __success __log_level(2) +/* First simulated path does not include callback body */ __msg("14: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 14 first_idx 11") +__msg("mark_precise: frame0: last_idx 14 first_idx 10") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)") +__msg("mark_precise: frame0: regs= stack=-8 before 10: (85) call bpf_loop") __msg("mark_precise: frame0: parent state regs= stack=-8:") -__msg("mark_precise: frame0: last_idx 18 first_idx 0") -__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") -__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") -__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") +__msg("mark_precise: frame0: last_idx 9 first_idx 0 subseq_idx 10") __msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8") __msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6") __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") +/* State entering callback body popped from states stack */ +__msg("from 10 to 17: frame1:") +__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb") +__msg("17: (b7) r0 = 0") +__msg("18: (95) exit") +__msg("returning from callee:") +__msg("to caller at 10:") +/* r4 (flags) is always precise for bpf_loop(), + * fp-8 was marked before backtracking to callback body. + */ +__msg("frame 0: propagating r4,fp-8") +__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1") +__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit") +__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") +__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") +__msg("mark_precise: frame0: parent state regs= stack=:") +__msg("from 18 to 10: safe") __naked int parent_stack_slot_precise_with_callback(void) { asm volatile ( -- cgit From 958465e217dbf5fc6677d42d8827fb3073d86afd Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:57 +0200 Subject: selftests/bpf: tests for iterating callbacks A set of test cases to check behavior of callback handling logic, check if verifier catches the following situations: - program not safe on second callback iteration; - program not safe on zero callback iterations; - infinite loop inside a callback. Verify that callback logic works for bpf_loop, bpf_for_each_map_elem, bpf_user_ringbuf_drain, bpf_find_vma. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_iterating_callbacks.c | 147 +++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index e5c61aa6604a..5cfa7a6316b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -31,6 +31,7 @@ #include "verifier_helper_restricted.skel.h" #include "verifier_helper_value_access.skel.h" #include "verifier_int_ptr.skel.h" +#include "verifier_iterating_callbacks.skel.h" #include "verifier_jeq_infer_not_null.skel.h" #include "verifier_ld_ind.skel.h" #include "verifier_ldsx.skel.h" @@ -139,6 +140,7 @@ void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_acces void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); } void test_verifier_helper_value_access(void) { RUN(verifier_helper_value_access); } void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } +void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); } void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); } diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c new file mode 100644 index 000000000000..fa9429f77a81 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 8); + __type(key, __u32); + __type(value, __u64); +} map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_USER_RINGBUF); + __uint(max_entries, 8); +} ringbuf SEC(".maps"); + +struct vm_area_struct; +struct bpf_map; + +struct buf_context { + char *buf; +}; + +struct num_context { + __u64 i; +}; + +__u8 choice_arr[2] = { 0, 1 }; + +static int unsafe_on_2nd_iter_cb(__u32 idx, struct buf_context *ctx) +{ + if (idx == 0) { + ctx->buf = (char *)(0xDEAD); + return 0; + } + + if (bpf_probe_read_user(ctx->buf, 8, (void *)(0xBADC0FFEE))) + return 1; + + return 0; +} + +SEC("?raw_tp") +__failure __msg("R1 type=scalar expected=fp") +int unsafe_on_2nd_iter(void *unused) +{ + char buf[4]; + struct buf_context loop_ctx = { .buf = buf }; + + bpf_loop(100, unsafe_on_2nd_iter_cb, &loop_ctx, 0); + return 0; +} + +static int unsafe_on_zero_iter_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i = 0; + return 0; +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_on_zero_iter(void *unused) +{ + struct num_context loop_ctx = { .i = 32 }; + + bpf_loop(100, unsafe_on_zero_iter_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +static int loop_detection_cb(__u32 idx, struct num_context *ctx) +{ + for (;;) {} + return 0; +} + +SEC("?raw_tp") +__failure __msg("infinite loop detected") +int loop_detection(void *unused) +{ + struct num_context loop_ctx = { .i = 0 }; + + bpf_loop(100, loop_detection_cb, &loop_ctx, 0); + return 0; +} + +static __always_inline __u64 oob_state_machine(struct num_context *ctx) +{ + switch (ctx->i) { + case 0: + ctx->i = 1; + break; + case 1: + ctx->i = 32; + break; + } + return 0; +} + +static __u64 for_each_map_elem_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data) +{ + return oob_state_machine(data); +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_for_each_map_elem(void *unused) +{ + struct num_context loop_ctx = { .i = 0 }; + + bpf_for_each_map_elem(&map, for_each_map_elem_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +static __u64 ringbuf_drain_cb(struct bpf_dynptr *dynptr, void *data) +{ + return oob_state_machine(data); +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_ringbuf_drain(void *unused) +{ + struct num_context loop_ctx = { .i = 0 }; + + bpf_user_ringbuf_drain(&ringbuf, ringbuf_drain_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +static __u64 find_vma_cb(struct task_struct *task, struct vm_area_struct *vma, void *data) +{ + return oob_state_machine(data); +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_find_vma(void *unused) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct num_context loop_ctx = { .i = 0 }; + + bpf_find_vma(task, 0, find_vma_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +char _license[] SEC("license") = "GPL"; -- cgit From cafe2c21508a38cdb3ed22708842e957b2572c3e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:58 +0200 Subject: bpf: widening for callback iterators Callbacks are similar to open coded iterators, so add imprecise widening logic for callback body processing. This makes callback based loops behave identically to open coded iterators, e.g. allowing to verify programs like below: struct ctx { u32 i; }; int cb(u32 idx, struct ctx* ctx) { ++ctx->i; return 0; } ... struct ctx ctx = { .i = 0 }; bpf_loop(100, cb, &ctx, 0); ... Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a60dfa56ebb3..2f03e6b11bb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9799,9 +9799,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *state = env->cur_state, *prev_st; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; + bool in_callback_fn; int err; callee = state->frame[state->curframe]; @@ -9856,7 +9857,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * there function call logic would reschedule callback visit. If iteration * converges is_state_visited() would prune that visit eventually. */ - if (callee->in_callback_fn) + in_callback_fn = callee->in_callback_fn; + if (in_callback_fn) *insn_idx = callee->callsite; else *insn_idx = callee->callsite + 1; @@ -9871,6 +9873,24 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; + + /* for callbacks widen imprecise scalars to make programs like below verify: + * + * struct ctx { int i; } + * void cb(int idx, struct ctx *ctx) { ctx->i++; ... } + * ... + * struct ctx = { .i = 0; } + * bpf_loop(100, cb, &ctx, 0); + * + * This is similar to what is done in process_iter_next_call() for open + * coded iterators. + */ + prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL; + if (prev_st) { + err = widen_imprecise_scalars(env, prev_st, state); + if (err) + return err; + } return 0; } -- cgit From 9f3330aa644d6d979eb064c46e85c62d4b4eac75 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:59 +0200 Subject: selftests/bpf: test widening for iterating callbacks A test case to verify that imprecise scalars widening is applied to callback entering state, when callback call is simulated repeatedly. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_iterating_callbacks.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index fa9429f77a81..598c1e984b26 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -25,6 +25,7 @@ struct buf_context { struct num_context { __u64 i; + __u64 j; }; __u8 choice_arr[2] = { 0, 1 }; @@ -69,6 +70,25 @@ int unsafe_on_zero_iter(void *unused) return choice_arr[loop_ctx.i]; } +static int widening_cb(__u32 idx, struct num_context *ctx) +{ + ++ctx->i; + return 0; +} + +SEC("?raw_tp") +__success +int widening(void *unused) +{ + struct num_context loop_ctx = { .i = 0, .j = 1 }; + + bpf_loop(100, widening_cb, &loop_ctx, 0); + /* loop_ctx.j is not changed during callback iteration, + * verifier should not apply widening to it. + */ + return choice_arr[loop_ctx.j]; +} + static int loop_detection_cb(__u32 idx, struct num_context *ctx) { for (;;) {} -- cgit From bb124da69c47dd98d69361ec13244ece50bec63e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:07:00 +0200 Subject: bpf: keep track of max number of bpf_loop callback iterations In some cases verifier can't infer convergence of the bpf_loop() iteration. E.g. for the following program: static int cb(__u32 idx, struct num_context* ctx) { ctx->i++; return 0; } SEC("?raw_tp") int prog(void *_) { struct num_context ctx = { .i = 0 }; __u8 choice_arr[2] = { 0, 1 }; bpf_loop(2, cb, &ctx, 0); return choice_arr[ctx.i]; } Each 'cb' simulation would eventually return to 'prog' and reach 'return choice_arr[ctx.i]' statement. At which point ctx.i would be marked precise, thus forcing verifier to track multitude of separate states with {.i=0}, {.i=1}, ... at bpf_loop() callback entry. This commit allows "brute force" handling for such cases by limiting number of callback body simulations using 'umax' value of the first bpf_loop() parameter. For this, extend bpf_func_state with 'callback_depth' field. Increment this field when callback visiting state is pushed to states traversal stack. For frame #N it's 'callback_depth' field counts how many times callback with frame depth N+1 had been executed. Use bpf_func_state specifically to allow independent tracking of callback depths when multiple nested bpf_loop() calls are present. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-11-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 +++++++ kernel/bpf/verifier.c | 19 ++++++++++-- .../bpf/progs/verifier_subprog_precision.c | 35 +++++++++++++++------- 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dd326936dd6f..aa4d19d0bc94 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -301,6 +301,17 @@ struct bpf_func_state { struct tnum callback_ret_range; bool in_async_callback_fn; bool in_exception_callback_fn; + /* For callback calling functions that limit number of possible + * callback executions (e.g. bpf_loop) keeps track of current + * simulated iteration number. + * Value in frame N refers to number of times callback with frame + * N+1 was simulated, e.g. for the following call: + * + * bpf_loop(..., fn, ...); | suppose current frame is N + * | fn would be simulated in frame N+1 + * | number of simulations is tracked in frame N + */ + u32 callback_depth; /* The following fields should be last. See copy_func_state() */ int acquired_refs; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2f03e6b11bb9..af2819d5c8ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9505,6 +9505,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return err; callback_state->callback_unroll_depth++; + callback_state->frame[callback_state->curframe - 1]->callback_depth++; + caller->callback_depth = 0; return 0; } @@ -10309,8 +10311,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; case BPF_FUNC_loop: update_loop_inline_state(env, meta.subprogno); - err = push_callback_call(env, insn, insn_idx, meta.subprogno, - set_loop_callback_state); + /* Verifier relies on R1 value to determine if bpf_loop() iteration + * is finished, thus mark it precise. + */ + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; + if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_loop_callback_state); + } else { + cur_func(env)->callback_depth = 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame%d bpf_loop iteration limit reached\n", + env->cur_state->curframe); + } break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index da803cffb5ef..f61d623b1ce8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -119,7 +119,23 @@ __naked int global_subprog_result_precise(void) SEC("?raw_tp") __success __log_level(2) -/* First simulated path does not include callback body */ +/* First simulated path does not include callback body, + * r1 and r4 are always precise for bpf_loop() calls. + */ +__msg("9: (85) call bpf_loop#181") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: parent state regs=r4 stack=:") +__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") +__msg("mark_precise: frame0: regs=r4 stack= before 8: (b7) r4 = 0") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: parent state regs=r1 stack=:") +__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") +__msg("mark_precise: frame0: regs=r1 stack= before 8: (b7) r4 = 0") +__msg("mark_precise: frame0: regs=r1 stack= before 7: (b7) r3 = 0") +__msg("mark_precise: frame0: regs=r1 stack= before 6: (bf) r2 = r8") +__msg("mark_precise: frame0: regs=r1 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") +/* r6 precision propagation */ __msg("14: (0f) r1 += r6") __msg("mark_precise: frame0: last_idx 14 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") @@ -134,10 +150,9 @@ __msg("17: (b7) r0 = 0") __msg("18: (95) exit") __msg("returning from callee:") __msg("to caller at 9:") -/* r4 (flags) is always precise for bpf_loop() */ -__msg("frame 0: propagating r4") +__msg("frame 0: propagating r1,r4") __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") -__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit") +__msg("mark_precise: frame0: regs=r1,r4 stack= before 18: (95) exit") __msg("from 18 to 9: safe") __naked int callback_result_precise(void) { @@ -264,12 +279,12 @@ __msg("15: (b7) r0 = 0") __msg("16: (95) exit") __msg("returning from callee:") __msg("to caller at 9:") -/* r4 (flags) is always precise for bpf_loop(), +/* r1, r4 are always precise for bpf_loop(), * r6 was marked before backtracking to callback body. */ -__msg("frame 0: propagating r4,r6") +__msg("frame 0: propagating r1,r4,r6") __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") -__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit") +__msg("mark_precise: frame0: regs=r1,r4,r6 stack= before 16: (95) exit") __msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") __msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop") __msg("mark_precise: frame0: parent state regs= stack=:") @@ -422,12 +437,12 @@ __msg("17: (b7) r0 = 0") __msg("18: (95) exit") __msg("returning from callee:") __msg("to caller at 10:") -/* r4 (flags) is always precise for bpf_loop(), +/* r1, r4 are always precise for bpf_loop(), * fp-8 was marked before backtracking to callback body. */ -__msg("frame 0: propagating r4,fp-8") +__msg("frame 0: propagating r1,r4,fp-8") __msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1") -__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit") +__msg("mark_precise: frame0: regs=r1,r4 stack=-8 before 18: (95) exit") __msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") __msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") __msg("mark_precise: frame0: parent state regs= stack=:") -- cgit From 57e2a52deeb12ab84c15c6d0fb93638b5b94001b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:07:01 +0200 Subject: selftests/bpf: check if max number of bpf_loop iterations is tracked Check that even if bpf_loop() callback simulation does not converge to a specific state, verification could proceed via "brute force" simulation of maximal number of callback calls. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-12-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_iterating_callbacks.c | 75 ++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index 598c1e984b26..5905e036e0ea 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -164,4 +164,79 @@ int unsafe_find_vma(void *unused) return choice_arr[loop_ctx.i]; } +static int iter_limit_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i++; + return 0; +} + +SEC("?raw_tp") +__success +int bpf_loop_iter_limit_ok(void *unused) +{ + struct num_context ctx = { .i = 0 }; + + bpf_loop(1, iter_limit_cb, &ctx, 0); + return choice_arr[ctx.i]; +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=2 size=1") +int bpf_loop_iter_limit_overflow(void *unused) +{ + struct num_context ctx = { .i = 0 }; + + bpf_loop(2, iter_limit_cb, &ctx, 0); + return choice_arr[ctx.i]; +} + +static int iter_limit_level2a_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i += 100; + return 0; +} + +static int iter_limit_level2b_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i += 10; + return 0; +} + +static int iter_limit_level1_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i += 1; + bpf_loop(1, iter_limit_level2a_cb, ctx, 0); + bpf_loop(1, iter_limit_level2b_cb, ctx, 0); + return 0; +} + +/* Check that path visiting every callback function once had been + * reached by verifier. Variables 'ctx{1,2}i' below serve as flags, + * with each decimal digit corresponding to a callback visit marker. + */ +SEC("socket") +__success __retval(111111) +int bpf_loop_iter_limit_nested(void *unused) +{ + struct num_context ctx1 = { .i = 0 }; + struct num_context ctx2 = { .i = 0 }; + __u64 a, b, c; + + bpf_loop(1, iter_limit_level1_cb, &ctx1, 0); + bpf_loop(1, iter_limit_level1_cb, &ctx2, 0); + a = ctx1.i; + b = ctx2.i; + /* Force 'ctx1.i' and 'ctx2.i' precise. */ + c = choice_arr[(a + b) % 2]; + /* This makes 'c' zero, but neither clang nor verifier know it. */ + c /= 10; + /* Make sure that verifier does not visit 'impossible' states: + * enumerate all possible callback visit masks. + */ + if (a != 0 && a != 1 && a != 11 && a != 101 && a != 111 && + b != 0 && b != 1 && b != 11 && b != 101 && b != 111) + asm volatile ("r0 /= 0;" ::: "r0"); + return 1000 * a + b + c; +} + char _license[] SEC("license") = "GPL"; -- cgit From c0e2926266af3b5acf28df0a8fc6e4d90effe0bb Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Sun, 19 Nov 2023 22:17:59 +0800 Subject: ipv4: Correct/silence an endian warning in __ip_do_redirect net/ipv4/route.c:783:46: warning: incorrect type in argument 2 (different base types) net/ipv4/route.c:783:46: expected unsigned int [usertype] key net/ipv4/route.c:783:46: got restricted __be32 [usertype] new_gw Fixes: 969447f226b4 ("ipv4: use new_gw for redirect neigh lookup") Suggested-by: Eric Dumazet Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20231119141759.420477-1-chentao@kylinos.cn Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3290a4442b4a..16615d107cf0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -780,7 +780,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow goto reject_redirect; } - n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); + n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { -- cgit From d30fb712e52964f2cf9a9c14cf67078394044837 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Sun, 19 Nov 2023 08:23:41 -0800 Subject: hv_netvsc: fix race of netvsc and VF register_netdevice The rtnl lock also needs to be held before rndis_filter_device_add() which advertises nvsp_2_vsc_capability / sriov bit, and triggers VF NIC offering and registering. If VF NIC finished register_netdev() earlier it may cause name based config failure. To fix this issue, move the call to rtnl_lock() before rndis_filter_device_add(), so VF will be registered later than netvsc / synthetic NIC, and gets a name numbered (ethX) after netvsc. Cc: stable@vger.kernel.org Fixes: e04e7a7bbd4b ("hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe()") Reported-by: Dexuan Cui Signed-off-by: Haiyang Zhang Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Reviewed-by: Dexuan Cui Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc_drv.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 3ba3c8fb28a5..5e528a76f5f5 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2531,15 +2531,6 @@ static int netvsc_probe(struct hv_device *dev, goto devinfo_failed; } - nvdev = rndis_filter_device_add(dev, device_info); - if (IS_ERR(nvdev)) { - ret = PTR_ERR(nvdev); - netdev_err(net, "unable to add netvsc device (ret %d)\n", ret); - goto rndis_failed; - } - - eth_hw_addr_set(net, device_info->mac_adr); - /* We must get rtnl lock before scheduling nvdev->subchan_work, * otherwise netvsc_subchan_work() can get rtnl lock first and wait * all subchannels to show up, but that may not happen because @@ -2547,9 +2538,23 @@ static int netvsc_probe(struct hv_device *dev, * -> ... -> device_add() -> ... -> __device_attach() can't get * the device lock, so all the subchannels can't be processed -- * finally netvsc_subchan_work() hangs forever. + * + * The rtnl lock also needs to be held before rndis_filter_device_add() + * which advertises nvsp_2_vsc_capability / sriov bit, and triggers + * VF NIC offering and registering. If VF NIC finished register_netdev() + * earlier it may cause name based config failure. */ rtnl_lock(); + nvdev = rndis_filter_device_add(dev, device_info); + if (IS_ERR(nvdev)) { + ret = PTR_ERR(nvdev); + netdev_err(net, "unable to add netvsc device (ret %d)\n", ret); + goto rndis_failed; + } + + eth_hw_addr_set(net, device_info->mac_adr); + if (nvdev->num_chn > 1) schedule_work(&nvdev->subchan_work); @@ -2586,9 +2591,9 @@ static int netvsc_probe(struct hv_device *dev, return 0; register_failed: - rtnl_unlock(); rndis_filter_device_remove(dev, nvdev); rndis_failed: + rtnl_unlock(); netvsc_devinfo_put(device_info); devinfo_failed: free_percpu(net_device_ctx->vf_stats); -- cgit From 85520856466ed6bc3b1ccb013cddac70ceb437db Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Sun, 19 Nov 2023 08:23:42 -0800 Subject: hv_netvsc: Fix race of register_netdevice_notifier and VF register If VF NIC is registered earlier, NETDEV_REGISTER event is replayed, but NETDEV_POST_INIT is not. Move register_netdevice_notifier() earlier, so the call back function is set before probing. Cc: stable@vger.kernel.org Fixes: e04e7a7bbd4b ("hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe()") Reported-by: Dexuan Cui Signed-off-by: Haiyang Zhang Reviewed-by: Wojciech Drewek Reviewed-by: Dexuan Cui Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc_drv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5e528a76f5f5..b7dfd51f09e6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2793,12 +2793,17 @@ static int __init netvsc_drv_init(void) } netvsc_ring_bytes = ring_size * PAGE_SIZE; + register_netdevice_notifier(&netvsc_netdev_notifier); + ret = vmbus_driver_register(&netvsc_drv); if (ret) - return ret; + goto err_vmbus_reg; - register_netdevice_notifier(&netvsc_netdev_notifier); return 0; + +err_vmbus_reg: + unregister_netdevice_notifier(&netvsc_netdev_notifier); + return ret; } MODULE_LICENSE("GPL"); -- cgit From c807d6cd089d2f4951baa838081ec5ae3e2360f8 Mon Sep 17 00:00:00 2001 From: Long Li Date: Sun, 19 Nov 2023 08:23:43 -0800 Subject: hv_netvsc: Mark VF as slave before exposing it to user-mode When a VF is being exposed form the kernel, it should be marked as "slave" before exposing to the user-mode. The VF is not usable without netvsc running as master. The user-mode should never see a VF without the "slave" flag. This commit moves the code of setting the slave flag to the time before VF is exposed to user-mode. Cc: stable@vger.kernel.org Fixes: 0c195567a8f6 ("netvsc: transparent VF management") Signed-off-by: Long Li Signed-off-by: Haiyang Zhang Acked-by: Stephen Hemminger Acked-by: Dexuan Cui Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc_drv.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index b7dfd51f09e6..706ea5263e87 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2206,9 +2206,6 @@ static int netvsc_vf_join(struct net_device *vf_netdev, goto upper_link_failed; } - /* set slave flag before open to prevent IPv6 addrconf */ - vf_netdev->flags |= IFF_SLAVE; - schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); @@ -2315,16 +2312,18 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) } - /* Fallback path to check synthetic vf with - * help of mac addr + /* Fallback path to check synthetic vf with help of mac addr. + * Because this function can be called before vf_netdev is + * initialized (NETDEV_POST_INIT) when its perm_addr has not been copied + * from dev_addr, also try to match to its dev_addr. + * Note: On Hyper-V and Azure, it's not possible to set a MAC address + * on a VF that matches to the MAC of a unrelated NETVSC device. */ list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { ndev = hv_get_drvdata(ndev_ctx->device_ctx); - if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) { - netdev_notice(vf_netdev, - "falling back to mac addr based matching\n"); + if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) || + ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr)) return ndev; - } } netdev_notice(vf_netdev, @@ -2332,6 +2331,19 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) return NULL; } +static int netvsc_prepare_bonding(struct net_device *vf_netdev) +{ + struct net_device *ndev; + + ndev = get_netvsc_byslot(vf_netdev); + if (!ndev) + return NOTIFY_DONE; + + /* set slave flag before open to prevent IPv6 addrconf */ + vf_netdev->flags |= IFF_SLAVE; + return NOTIFY_DONE; +} + static int netvsc_register_vf(struct net_device *vf_netdev) { struct net_device_context *net_device_ctx; @@ -2758,6 +2770,8 @@ static int netvsc_netdev_event(struct notifier_block *this, return NOTIFY_DONE; switch (event) { + case NETDEV_POST_INIT: + return netvsc_prepare_bonding(event_dev); case NETDEV_REGISTER: return netvsc_register_vf(event_dev); case NETDEV_UNREGISTER: -- cgit From 9c6dc13106f2dd2d6819d66618b25a6f41f0ee6a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 20 Nov 2023 08:28:40 +0000 Subject: MAINTAINERS: Add indirect_call_wrapper.h to NETWORKING [GENERAL] indirect_call_wrapper.h is not, strictly speaking, networking specific. However, it's git history indicates that in practice changes go through netdev and thus the netdev maintainers have effectively been taking responsibility for it. Formalise this by adding it to the NETWORKING [GENERAL] section in the MAINTAINERS file. It is not clear how many other files under include/linux fall into this category and it would be interesting, as a follow-up, to audit that and propose further updates to the MAINTAINERS file as appropriate. Link: https://lore.kernel.org/netdev/20231116010310.4664dd38@kernel.org/ Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20231120-indirect_call_wrapper-maintainer-v1-1-0a6bb1f7363e@kernel.org Signed-off-by: Paolo Abeni --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 80a95878c61d..6cc0708f77b4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15057,6 +15057,7 @@ F: Documentation/networking/ F: Documentation/process/maintainer-netdev.rst F: Documentation/userspace-api/netlink/ F: include/linux/in.h +F: include/linux/indirect_call_wrapper.h F: include/linux/net.h F: include/linux/netdevice.h F: include/net/ -- cgit From 0739af07d1d947af27c877f797cb82ceee702515 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Mon, 20 Nov 2023 13:06:29 +0100 Subject: net: usb: ax88179_178a: fix failed operations during ax88179_reset Using generic ASIX Electronics Corp. AX88179 Gigabit Ethernet device, the following test cycle has been implemented: - power on - check logs - shutdown - after detecting the system shutdown, disconnect power - after approximately 60 seconds of sleep, power is restored Running some cycles, sometimes error logs like this appear: kernel: ax88179_178a 2-9:1.0 (unnamed net_device) (uninitialized): Failed to write reg index 0x0001: -19 kernel: ax88179_178a 2-9:1.0 (unnamed net_device) (uninitialized): Failed to read reg index 0x0001: -19 ... These failed operation are happening during ax88179_reset execution, so the initialization could not be correct. In order to avoid this, we need to increase the delay after reset and clock initial operations. By using these larger values, many cycles have been run and no failed operations appear. It would be better to check some status register to verify when the operation has finished, but I do not have found any available information (neither in the public datasheets nor in the manufacturer's driver). The only available information for the necessary delays is the maufacturer's driver (original values) but the proposed values are not enough for the tested devices. Fixes: e2ca90c276e1f ("ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver") Reported-by: Herb Wei Tested-by: Herb Wei Signed-off-by: Jose Ignacio Tornos Martinez Link: https://lore.kernel.org/r/20231120120642.54334-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index aff39bf3161d..4ea0e155bb0d 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1583,11 +1583,11 @@ static int ax88179_reset(struct usbnet *dev) *tmp16 = AX_PHYPWR_RSTCTL_IPRL; ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_PHYPWR_RSTCTL, 2, 2, tmp16); - msleep(200); + msleep(500); *tmp = AX_CLK_SELECT_ACS | AX_CLK_SELECT_BCS; ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_CLK_SELECT, 1, 1, tmp); - msleep(100); + msleep(200); /* Ethernet PHY Auto Detach*/ ax88179_auto_detach(dev); -- cgit From 495ec91b48e489afefb2ad714f0d9b68c3016c6c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2023 12:01:09 -0800 Subject: docs: netdev: try to guide people on dealing with silence There has been more than a few threads which went idle before the merge window and now people came back to them and started asking about next steps. We currently tell people to be patient and not to repost too often. Our "not too often", however, is still a few orders of magnitude faster than other subsystems. Or so I feel after hearing people talk about review rates at LPC. Clarify in the doc that if the discussion went idle for a week on netdev, 95% of the time there's no point waiting longer. Link: https://lore.kernel.org/r/20231120200109.620392-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/process/maintainer-netdev.rst | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index 7feacc20835e..84ee60fceef2 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -193,9 +193,23 @@ Review timelines Generally speaking, the patches get triaged quickly (in less than 48h). But be patient, if your patch is active in patchwork (i.e. it's listed on the project's patch list) the chances it was missed are close to zero. -Asking the maintainer for status updates on your -patch is a good way to ensure your patch is ignored or pushed to the -bottom of the priority list. + +The high volume of development on netdev makes reviewers move on +from discussions relatively quickly. New comments and replies +are very unlikely to arrive after a week of silence. If a patch +is no longer active in patchwork and the thread went idle for more +than a week - clarify the next steps and/or post the next version. + +For RFC postings specifically, if nobody responded in a week - reviewers +either missed the posting or have no strong opinions. If the code is ready, +repost as a PATCH. + +Emails saying just "ping" or "bump" are considered rude. If you can't figure +out the status of the patch from patchwork or where the discussion has +landed - describe your best guess and ask if it's correct. For example:: + + I don't understand what the next steps are. Person X seems to be unhappy + with A, should I do B and repost the patches? .. _Changes requested: -- cgit From b6fe6f03716da246b453369f98a553d4ab21447c Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 21 Nov 2023 09:37:09 +0800 Subject: dpll: Fix potential msg memleak when genlmsg_put_reply failed We should clean the skb resource if genlmsg_put_reply failed. Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") Signed-off-by: Hao Ge Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20231121013709.73323-1-gehao@kylinos.cn Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_netlink.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index a6dc3997bf5c..442a0ebeb953 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -1093,9 +1093,10 @@ int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_PIN_ID_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; - + } pin = dpll_pin_find_from_nlattr(info); if (!IS_ERR(pin)) { ret = dpll_msg_add_pin_handle(msg, pin); @@ -1123,8 +1124,10 @@ int dpll_nl_pin_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_PIN_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; + } ret = dpll_cmd_pin_get_one(msg, pin, info->extack); if (ret) { nlmsg_free(msg); @@ -1256,8 +1259,10 @@ int dpll_nl_device_id_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_DEVICE_ID_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; + } dpll = dpll_device_find_from_nlattr(info); if (!IS_ERR(dpll)) { @@ -1284,8 +1289,10 @@ int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_DEVICE_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; + } ret = dpll_device_get_one(dpll, msg, info->extack); if (ret) { -- cgit From 84d2db91f14a32dc856a5972e3f0907089093c7a Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Tue, 21 Nov 2023 15:53:57 +0800 Subject: nfc: virtual_ncidev: Add variable to check if ndev is running syzbot reported an memory leak that happens when an skb is add to send_buff after virtual nci closed. This patch adds a variable to track if the ndev is running before handling new skb in send function. Signed-off-by: Nguyen Dinh Phi Reported-by: syzbot+6eb09d75211863f15e3e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/00000000000075472b06007df4fb@google.com Reviewed-by: Bongsu Jeon Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/nfc/virtual_ncidev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/virtual_ncidev.c b/drivers/nfc/virtual_ncidev.c index b027be0b0b6f..590b038e449e 100644 --- a/drivers/nfc/virtual_ncidev.c +++ b/drivers/nfc/virtual_ncidev.c @@ -26,10 +26,14 @@ struct virtual_nci_dev { struct mutex mtx; struct sk_buff *send_buff; struct wait_queue_head wq; + bool running; }; static int virtual_nci_open(struct nci_dev *ndev) { + struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); + + vdev->running = true; return 0; } @@ -40,6 +44,7 @@ static int virtual_nci_close(struct nci_dev *ndev) mutex_lock(&vdev->mtx); kfree_skb(vdev->send_buff); vdev->send_buff = NULL; + vdev->running = false; mutex_unlock(&vdev->mtx); return 0; @@ -50,7 +55,7 @@ static int virtual_nci_send(struct nci_dev *ndev, struct sk_buff *skb) struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); mutex_lock(&vdev->mtx); - if (vdev->send_buff) { + if (vdev->send_buff || !vdev->running) { mutex_unlock(&vdev->mtx); kfree_skb(skb); return -1; -- cgit From e6d71b437abc2f249e3b6a1ae1a7228e09c6e563 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 22 Nov 2023 10:37:05 +0800 Subject: net/smc: avoid data corruption caused by decline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found a data corruption issue during testing of SMC-R on Redis applications. The benchmark has a low probability of reporting a strange error as shown below. "Error: Protocol error, got "\xe2" as reply type byte" Finally, we found that the retrieved error data was as follows: 0xE2 0xD4 0xC3 0xD9 0x04 0x00 0x2C 0x20 0xA6 0x56 0x00 0x16 0x3E 0x0C 0xCB 0x04 0x02 0x01 0x00 0x00 0x20 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0xE2 It is quite obvious that this is a SMC DECLINE message, which means that the applications received SMC protocol message. We found that this was caused by the following situations: client server ¦ clc proposal -------------> ¦ clc accept <------------- ¦ clc confirm -------------> wait llc confirm send llc confirm ¦failed llc confirm ¦ x------ (after 2s)timeout wait llc confirm rsp wait decline (after 1s) timeout (after 2s) timeout ¦ decline --------------> ¦ decline <-------------- As a result, a decline message was sent in the implementation, and this message was read from TCP by the already-fallback connection. This patch double the client timeout as 2x of the server value, With this simple change, the Decline messages should never cross or collide (during Confirm link timeout). This issue requires an immediate solution, since the protocol updates involve a more long-term solution. Fixes: 0fb0b02bd6fd ("net/smc: adapt SMC client code to use the LLC flow") Signed-off-by: D. Wythe Reviewed-by: Wen Gu Reviewed-by: Wenjia Zhang Signed-off-by: David S. Miller --- net/smc/af_smc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da97f946b79b..2a1388841951 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -598,8 +598,12 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - /* receive CONFIRM LINK request from server over RoCE fabric */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + /* Receive CONFIRM LINK request from server over RoCE fabric. + * Increasing the client's timeout by twice as much as the server's + * timeout by default can temporarily avoid decline messages of + * both sides crossing or colliding + */ + qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; -- cgit From 6a26310273c323380da21eb23fcfd50e31140913 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 21 Nov 2023 09:09:33 +0100 Subject: Revert "net: r8169: Disable multicast filter for RTL8168H and RTL8107E" This reverts commit efa5f1311c4998e9e6317c52bc5ee93b3a0f36df. I couldn't reproduce the reported issue. What I did, based on a pcap packet log provided by the reporter: - Used same chip version (RTL8168h) - Set MAC address to the one used on the reporters system - Replayed the EAPOL unicast packet that, according to the reporter, was filtered out by the mc filter. The packet was properly received. Therefore the root cause of the reported issue seems to be somewhere else. Disabling mc filtering completely for the most common chip version is a quite big hammer. Therefore revert the change and wait for further analysis results from the reporter. Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index b9bb1d2f0237..295366a85c63 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2599,9 +2599,7 @@ static void rtl_set_rx_mode(struct net_device *dev) rx_mode &= ~AcceptMulticast; } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || dev->flags & IFF_ALLMULTI || - tp->mac_version == RTL_GIGA_MAC_VER_35 || - tp->mac_version == RTL_GIGA_MAC_VER_46 || - tp->mac_version == RTL_GIGA_MAC_VER_48) { + tp->mac_version == RTL_GIGA_MAC_VER_35) { /* accept all multicasts */ } else if (netdev_mc_empty(dev)) { rx_mode &= ~AcceptMulticast; -- cgit From 99360d9620f09fb8bc15548d855011bbb198c680 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Sat, 18 Nov 2023 00:19:18 +0100 Subject: net: usb: qmi_wwan: claim interface 4 for ZTE MF290 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interface 4 is used by for QMI interface in stock firmware of MF28D, the router which uses MF290 modem. Rebind it to qmi_wwan after freeing it up from option driver. The proper configuration is: Interface mapping is: 0: QCDM, 1: (unknown), 2: AT (PCUI), 2: AT (Modem), 4: QMI T: Bus=01 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 4 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=0189 Rev= 0.00 S: Manufacturer=ZTE, Incorporated S: Product=ZTE LTE Technologies MSM C:* #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms Cc: Bjørn Mork Signed-off-by: Lech Perczak Link: https://lore.kernel.org/r/20231117231918.100278-3-lech.perczak@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 344af3c5c836..e2e181378f41 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1289,6 +1289,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x0168, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0176, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0178, 3)}, + {QMI_FIXED_INTF(0x19d2, 0x0189, 4)}, /* ZTE MF290 */ {QMI_FIXED_INTF(0x19d2, 0x0191, 4)}, /* ZTE EuFi890 */ {QMI_FIXED_INTF(0x19d2, 0x0199, 1)}, /* ZTE MF820S */ {QMI_FIXED_INTF(0x19d2, 0x0200, 1)}, -- cgit From 4aa1d8f89b10cdc25a231dabf808d8935e0b137a Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Tue, 21 Nov 2023 22:26:24 +0530 Subject: octeontx2-pf: Fix ntuple rule creation to direct packet to VF with higher Rx queue than its PF It is possible to add a ntuple rule which would like to direct packet to a VF whose number of queues are greater/less than its PF's queue numbers. For example a PF can have 2 Rx queues but a VF created on that PF can have 8 Rx queues. As of today, ntuple rule will reject rule because it is checking the requested queue number against PF's number of Rx queues. As a part of this fix if the action of a ntuple rule is to move a packet to a VF's queue then the check is removed. Also, a debug information is printed to aware user that it is user's responsibility to cross check if the requested queue number on that VF is a valid one. Fixes: f0a1913f8a6f ("octeontx2-pf: Add support for ethtool ntuple filters") Signed-off-by: Suman Ghosh Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231121165624.3664182-1-sumang@marvell.com Signed-off-by: Paolo Abeni --- .../net/ethernet/marvell/octeontx2/nic/otx2_flows.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c index 4762dbea64a1..97a71e9b8563 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c @@ -1088,6 +1088,7 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc) struct ethhdr *eth_hdr; bool new = false; int err = 0; + u64 vf_num; u32 ring; if (!flow_cfg->max_flows) { @@ -1100,7 +1101,21 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc) if (!(pfvf->flags & OTX2_FLAG_NTUPLE_SUPPORT)) return -ENOMEM; - if (ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC) + /* Number of queues on a VF can be greater or less than + * the PF's queue. Hence no need to check for the + * queue count. Hence no need to check queue count if PF + * is installing for its VF. Below is the expected vf_num value + * based on the ethtool commands. + * + * e.g. + * 1. ethtool -U ... action -1 ==> vf_num:255 + * 2. ethtool -U ... action ==> vf_num:0 + * 3. ethtool -U ... vf queue ==> + * vf_num:vf_idx+1 + */ + vf_num = ethtool_get_flow_spec_ring_vf(fsp->ring_cookie); + if (!is_otx2_vf(pfvf->pcifunc) && !vf_num && + ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC) return -EINVAL; if (fsp->location >= otx2_get_maxflows(flow_cfg)) @@ -1182,6 +1197,9 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc) flow_cfg->nr_flows++; } + if (flow->is_vf) + netdev_info(pfvf->netdev, + "Make sure that VF's queue number is within its queue limit\n"); return 0; } -- cgit From 818ad9cc90d4a7165caaee7e32800c50d0564ec3 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 21 Nov 2023 20:08:44 +0100 Subject: net: veth: fix ethtool stats reporting Fix a possible misalignment between page_pool stats and tx xdp_stats reported in veth_get_ethtool_stats routine. The issue can be reproduced configuring the veth pair with the following tx/rx queues: $ip link add v0 numtxqueues 2 numrxqueues 4 type veth peer name v1 \ numtxqueues 1 numrxqueues 1 and loading a simple XDP program on v0 that just returns XDP_PASS. In this case on v0 the page_pool stats overwrites tx xdp_stats for queue 1. Fix the issue incrementing pp_idx of dev->real_num_tx_queues * VETH_TQ_STATS_LEN since we always report xdp_stats for all tx queues in ethtool. Fixes: 4fc418053ec7 ("net: veth: add page_pool stats") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c5b5d0485016836448453f12846c7c4ab75b094a.1700593593.git.lorenzo@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/veth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 6cc352296c67..57efb3454c57 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -236,8 +236,8 @@ static void veth_get_ethtool_stats(struct net_device *dev, data[tx_idx + j] += *(u64 *)(base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); - pp_idx = tx_idx + VETH_TQ_STATS_LEN; } + pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; page_pool_stats: veth_get_page_pool_stats(dev, &data[pp_idx]); -- cgit From 676ec53844cbdf2f47e68a076cdff7f0ec6cbe3f Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 22 Nov 2023 00:44:33 +0530 Subject: amd-xgbe: handle corner-case during sfp hotplug Force the mode change for SFI in Fixed PHY configurations. Fixed PHY configurations needs PLL to be enabled while doing mode set. When the SFP module isn't connected during boot, driver assumes AN is ON and attempts auto-negotiation. However, if the connected SFP comes up in Fixed PHY configuration the link will not come up as PLL isn't enabled while the initial mode set command is issued. So, force the mode change for SFI in Fixed PHY configuration to fix link issues. Fixes: e57f7a3feaef ("amd-xgbe: Prepare for working with more than one type of phy") Acked-by: Shyam Sundar S K Signed-off-by: Raju Rangoju Reviewed-by: Wojciech Drewek Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 32d2c6fac652..4a2dc705b528 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1193,7 +1193,19 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata) if (pdata->phy.duplex != DUPLEX_FULL) return -EINVAL; - xgbe_set_mode(pdata, mode); + /* Force the mode change for SFI in Fixed PHY config. + * Fixed PHY configs needs PLL to be enabled while doing mode set. + * When the SFP module isn't connected during boot, driver assumes + * AN is ON and attempts autonegotiation. However, if the connected + * SFP comes up in Fixed PHY config, the link will not come up as + * PLL isn't enabled while the initial mode set command is issued. + * So, force the mode change for SFI in Fixed PHY configuration to + * fix link issues. + */ + if (mode == XGBE_MODE_SFI) + xgbe_change_mode(pdata, mode); + else + xgbe_set_mode(pdata, mode); return 0; } -- cgit From 7121205d5330c6a3cb3379348886d47c77b78d06 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 22 Nov 2023 00:44:34 +0530 Subject: amd-xgbe: handle the corner-case during tx completion The existing implementation uses software logic to accumulate tx completions until the specified time (1ms) is met and then poll them. However, there exists a tiny gap which leads to a race between resetting and checking the tx_activate flag. Due to this the tx completions are not reported to upper layer and tx queue timeout kicks-in restarting the device. To address this, introduce a tx cleanup mechanism as part of the periodic maintenance process. Fixes: c5aa9e3b8156 ("amd-xgbe: Initial AMD 10GbE platform driver") Acked-by: Shyam Sundar S K Signed-off-by: Raju Rangoju Reviewed-by: Wojciech Drewek Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 614c0278419b..6b73648b3779 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -682,10 +682,24 @@ static void xgbe_service(struct work_struct *work) static void xgbe_service_timer(struct timer_list *t) { struct xgbe_prv_data *pdata = from_timer(pdata, t, service_timer); + struct xgbe_channel *channel; + unsigned int i; queue_work(pdata->dev_workqueue, &pdata->service_work); mod_timer(&pdata->service_timer, jiffies + HZ); + + if (!pdata->tx_usecs) + return; + + for (i = 0; i < pdata->channel_count; i++) { + channel = pdata->channel[i]; + if (!channel->tx_ring || channel->tx_timer_active) + break; + channel->tx_timer_active = 1; + mod_timer(&channel->tx_timer, + jiffies + usecs_to_jiffies(pdata->tx_usecs)); + } } static void xgbe_init_timers(struct xgbe_prv_data *pdata) -- cgit From 7a2323ac24a50311f64a3a9b54ed5bef5821ecae Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 22 Nov 2023 00:44:35 +0530 Subject: amd-xgbe: propagate the correct speed and duplex status xgbe_get_link_ksettings() does not propagate correct speed and duplex information to ethtool during cable unplug. Due to which ethtool reports incorrect values for speed and duplex. Address this by propagating correct information. Fixes: 7c12aa08779c ("amd-xgbe: Move the PHY support into amd-xgbe") Acked-by: Shyam Sundar S K Signed-off-by: Raju Rangoju Reviewed-by: Wojciech Drewek Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c index 6e83ff59172a..32fab5e77246 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c @@ -314,10 +314,15 @@ static int xgbe_get_link_ksettings(struct net_device *netdev, cmd->base.phy_address = pdata->phy.address; - cmd->base.autoneg = pdata->phy.autoneg; - cmd->base.speed = pdata->phy.speed; - cmd->base.duplex = pdata->phy.duplex; + if (netif_carrier_ok(netdev)) { + cmd->base.speed = pdata->phy.speed; + cmd->base.duplex = pdata->phy.duplex; + } else { + cmd->base.speed = SPEED_UNKNOWN; + cmd->base.duplex = DUPLEX_UNKNOWN; + } + cmd->base.autoneg = pdata->phy.autoneg; cmd->base.port = PORT_NONE; XGBE_LM_COPY(cmd, supported, lks, supported); -- cgit From 0ffb08b1a45bd6b7694e01da0e1d9e3e788418fb Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 21 Nov 2023 13:12:55 -0800 Subject: ice: remove ptp_tx ring parameter flag Before performing a Tx timestamp in ice_stamp(), the driver checks a ptp_tx ring variable to see if timestamping is enabled on that ring. This value is set for all rings whenever userspace configures Tx timestamping. Ostensibly this was done to avoid wasting cycles checking other fields when timestamping has not been enabled. However, for Tx timestamps we already get an individual per-SKB flag indicating whether userspace wants to request a timestamp on that packet. We do not gain much by also having a separate flag to check for whether timestamping was enabled. In fact, the driver currently fails to restore the field after a PF reset. Because of this, if a PF reset occurs, timestamps will be disabled. Since this flag doesn't add value in the hotpath, remove it and always provide a timestamp if the SKB flag has been set. A following change will fix the reset path to properly restore user timestamping configuration completely. This went unnoticed for some time because one of the most common applications using Tx timestamps, ptp4l, will reconfigure the socket as part of its fault recovery logic. Fixes: ea9b847cda64 ("ice: enable transmit timestamps for E810 devices") Signed-off-by: Jacob Keller Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_ptp.c | 14 -------------- drivers/net/ethernet/intel/ice/ice_txrx.c | 3 --- drivers/net/ethernet/intel/ice/ice_txrx.h | 1 - 3 files changed, 18 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 1eddcbe89b0c..affd90622a68 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -280,20 +280,6 @@ static void ice_ptp_configure_tx_tstamp(struct ice_pf *pf, bool on) */ static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) { - struct ice_vsi *vsi; - u16 i; - - vsi = ice_get_main_vsi(pf); - if (!vsi) - return; - - /* Set the timestamp enable flag for all the Tx rings */ - ice_for_each_txq(vsi, i) { - if (!vsi->tx_rings[i]) - continue; - vsi->tx_rings[i]->ptp_tx = on; - } - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) ice_ptp_configure_tx_tstamp(pf, on); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 52d0a126eb61..9e97ea863068 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -2306,9 +2306,6 @@ ice_tstamp(struct ice_tx_ring *tx_ring, struct sk_buff *skb, if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) return; - if (!tx_ring->ptp_tx) - return; - /* Tx timestamps cannot be sampled when doing TSO */ if (first->tx_flags & ICE_TX_FLAGS_TSO) return; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 166413fc33f4..daf7b9dbb143 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -380,7 +380,6 @@ struct ice_tx_ring { #define ICE_TX_FLAGS_RING_VLAN_L2TAG2 BIT(2) u8 flags; u8 dcb_tc; /* Traffic class of ring */ - u8 ptp_tx; } ____cacheline_internodealigned_in_smp; static inline bool ice_ring_uses_build_skb(struct ice_rx_ring *ring) -- cgit From 7d606a1e2d0575b6c3a2600f43f90d1e409f9661 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 21 Nov 2023 13:12:56 -0800 Subject: ice: unify logic for programming PFINT_TSYN_MSK Commit d938a8cca88a ("ice: Auxbus devices & driver for E822 TS") modified how Tx timestamps are handled for E822 devices. On these devices, only the clock owner handles reading the Tx timestamp data from firmware. To do this, the PFINT_TSYN_MSK register is modified from the default value to one which enables reacting to a Tx timestamp on all PHY ports. The driver currently programs PFINT_TSYN_MSK in different places depending on whether the port is the clock owner or not. For the clock owner, the PFINT_TSYN_MSK value is programmed during ice_ptp_init_owner just before calling ice_ptp_tx_ena_intr to program the PHY ports. For the non-clock owner ports, the PFINT_TSYN_MSK is programmed during ice_ptp_init_port. If a large enough device reset occurs, the PFINT_TSYN_MSK register will be reset to the default value in which only the PHY associated directly with the PF will cause the Tx timestamp interrupt to trigger. The driver lacks logic to reprogram the PFINT_TSYN_MSK register after a device reset. For the E822 device, this results in the PF no longer responding to interrupts for other ports. This results in failure to deliver Tx timestamps to user space applications. Rename ice_ptp_configure_tx_tstamp to ice_ptp_cfg_tx_interrupt, and unify the logic for programming PFINT_TSYN_MSK and PFINT_OICR_ENA into one place. This function will program both registers according to the combination of user configuration and device requirements. This ensures that PFINT_TSYN_MSK is always restored when we configure the Tx timestamp interrupt. Fixes: d938a8cca88a ("ice: Auxbus devices & driver for E822 TS") Signed-off-by: Jacob Keller Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_ptp.c | 60 ++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index affd90622a68..624d05b4bbd9 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -256,21 +256,42 @@ ice_verify_pin_e810t(struct ptp_clock_info *info, unsigned int pin, } /** - * ice_ptp_configure_tx_tstamp - Enable or disable Tx timestamp interrupt - * @pf: The PF pointer to search in - * @on: bool value for whether timestamp interrupt is enabled or disabled + * ice_ptp_cfg_tx_interrupt - Configure Tx timestamp interrupt for the device + * @pf: Board private structure + * + * Program the device to respond appropriately to the Tx timestamp interrupt + * cause. */ -static void ice_ptp_configure_tx_tstamp(struct ice_pf *pf, bool on) +static void ice_ptp_cfg_tx_interrupt(struct ice_pf *pf) { + struct ice_hw *hw = &pf->hw; + bool enable; u32 val; + switch (pf->ptp.tx_interrupt_mode) { + case ICE_PTP_TX_INTERRUPT_ALL: + /* React to interrupts across all quads. */ + wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x1f); + enable = true; + break; + case ICE_PTP_TX_INTERRUPT_NONE: + /* Do not react to interrupts on any quad. */ + wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x0); + enable = false; + break; + case ICE_PTP_TX_INTERRUPT_SELF: + default: + enable = pf->ptp.tstamp_config.tx_type == HWTSTAMP_TX_ON; + break; + } + /* Configure the Tx timestamp interrupt */ - val = rd32(&pf->hw, PFINT_OICR_ENA); - if (on) + val = rd32(hw, PFINT_OICR_ENA); + if (enable) val |= PFINT_OICR_TSYN_TX_M; else val &= ~PFINT_OICR_TSYN_TX_M; - wr32(&pf->hw, PFINT_OICR_ENA, val); + wr32(hw, PFINT_OICR_ENA, val); } /** @@ -280,10 +301,9 @@ static void ice_ptp_configure_tx_tstamp(struct ice_pf *pf, bool on) */ static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) { - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) - ice_ptp_configure_tx_tstamp(pf, on); - pf->ptp.tstamp_config.tx_type = on ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; + + ice_ptp_cfg_tx_interrupt(pf); } /** @@ -2789,15 +2809,7 @@ static int ice_ptp_init_owner(struct ice_pf *pf) /* Release the global hardware lock */ ice_ptp_unlock(hw); - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_ALL) { - /* The clock owner for this device type handles the timestamp - * interrupt for all ports. - */ - ice_ptp_configure_tx_tstamp(pf, true); - - /* React on all quads interrupts for E82x */ - wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x1f); - + if (!ice_is_e810(hw)) { /* Enable quad interrupts */ err = ice_ptp_tx_ena_intr(pf, true, itr); if (err) @@ -2867,13 +2879,6 @@ static int ice_ptp_init_port(struct ice_pf *pf, struct ice_ptp_port *ptp_port) case ICE_PHY_E810: return ice_ptp_init_tx_e810(pf, &ptp_port->tx); case ICE_PHY_E822: - /* Non-owner PFs don't react to any interrupts on E82x, - * neither on own quad nor on others - */ - if (!ice_ptp_pf_handles_tx_interrupt(pf)) { - ice_ptp_configure_tx_tstamp(pf, false); - wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x0); - } kthread_init_delayed_work(&ptp_port->ov_work, ice_ptp_wait_for_offsets); @@ -3018,6 +3023,9 @@ void ice_ptp_init(struct ice_pf *pf) /* Start the PHY timestamping block */ ice_ptp_reset_phy_timestamping(pf); + /* Configure initial Tx interrupt settings */ + ice_ptp_cfg_tx_interrupt(pf); + set_bit(ICE_FLAG_PTP, pf->flags); err = ice_ptp_init_work(pf, ptp); if (err) -- cgit From 7758017911a4f2578d54c318e8fe77bcb5899054 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 21 Nov 2023 13:12:57 -0800 Subject: ice: restore timestamp configuration after device reset The driver calls ice_ptp_cfg_timestamp() during ice_ptp_prepare_for_reset() to disable timestamping while the device is resetting. This operation destroys the user requested configuration. While the driver does call ice_ptp_cfg_timestamp in ice_rebuild() to restore some hardware settings after a reset, it unconditionally passes true or false, resulting in failure to restore previous user space configuration. This results in a device reset forcibly disabling timestamp configuration regardless of current user settings. This was not detected previously due to a quirk of the LinuxPTP ptp4l application. If ptp4l detects a missing timestamp, it enters a fault state and performs recovery logic which includes executing SIOCSHWTSTAMP again, restoring the now accidentally cleared configuration. Not every application does this, and for these applications, timestamps will mysteriously stop after a PF reset, without being restored until an application restart. Fix this by replacing ice_ptp_cfg_timestamp() with two new functions: 1) ice_ptp_disable_timestamp_mode() which unconditionally disables the timestamping logic in ice_ptp_prepare_for_reset() and ice_ptp_release() 2) ice_ptp_restore_timestamp_mode() which calls ice_ptp_restore_tx_interrupt() to restore Tx timestamping configuration, calls ice_set_rx_tstamp() to restore Rx timestamping configuration, and issues an immediate TSYN_TX interrupt to ensure that timestamps which may have occurred during the device reset get processed. Modify the ice_ptp_set_timestamp_mode to directly save the user configuration and then call ice_ptp_restore_timestamp_mode. This way, reset no longer destroys the saved user configuration. This obsoletes the ice_set_tx_tstamp() function which can now be safely removed. With this change, all devices should now restore Tx and Rx timestamping functionality correctly after a PF reset without application intervention. Fixes: 77a781155a65 ("ice: enable receive hardware timestamping") Fixes: ea9b847cda64 ("ice: enable transmit timestamps for E810 devices") Signed-off-by: Jacob Keller Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_main.c | 12 ++--- drivers/net/ethernet/intel/ice/ice_ptp.c | 74 +++++++++++++++++++------------ drivers/net/ethernet/intel/ice/ice_ptp.h | 5 +-- 3 files changed, 51 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 6607fa6fe556..fb9c93f37e84 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -7401,15 +7401,6 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) goto err_vsi_rebuild; } - /* configure PTP timestamping after VSI rebuild */ - if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags)) { - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) - ice_ptp_cfg_timestamp(pf, false); - else if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_ALL) - /* for E82x PHC owner always need to have interrupts */ - ice_ptp_cfg_timestamp(pf, true); - } - err = ice_vsi_rebuild_by_type(pf, ICE_VSI_SWITCHDEV_CTRL); if (err) { dev_err(dev, "Switchdev CTRL VSI rebuild failed: %d\n", err); @@ -7461,6 +7452,9 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) ice_plug_aux_dev(pf); if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG)) ice_lag_rebuild(pf); + + /* Restore timestamp mode settings after VSI rebuild */ + ice_ptp_restore_timestamp_mode(pf); return; err_vsi_rebuild: diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 624d05b4bbd9..71f405f8a6fe 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -294,18 +294,6 @@ static void ice_ptp_cfg_tx_interrupt(struct ice_pf *pf) wr32(hw, PFINT_OICR_ENA, val); } -/** - * ice_set_tx_tstamp - Enable or disable Tx timestamping - * @pf: The PF pointer to search in - * @on: bool value for whether timestamps are enabled or disabled - */ -static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) -{ - pf->ptp.tstamp_config.tx_type = on ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; - - ice_ptp_cfg_tx_interrupt(pf); -} - /** * ice_set_rx_tstamp - Enable or disable Rx timestamping * @pf: The PF pointer to search in @@ -317,7 +305,7 @@ static void ice_set_rx_tstamp(struct ice_pf *pf, bool on) u16 i; vsi = ice_get_main_vsi(pf); - if (!vsi) + if (!vsi || !vsi->rx_rings) return; /* Set the timestamp flag for all the Rx rings */ @@ -326,23 +314,50 @@ static void ice_set_rx_tstamp(struct ice_pf *pf, bool on) continue; vsi->rx_rings[i]->ptp_rx = on; } +} + +/** + * ice_ptp_disable_timestamp_mode - Disable current timestamp mode + * @pf: Board private structure + * + * Called during preparation for reset to temporarily disable timestamping on + * the device. Called during remove to disable timestamping while cleaning up + * driver resources. + */ +static void ice_ptp_disable_timestamp_mode(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; + u32 val; + + val = rd32(hw, PFINT_OICR_ENA); + val &= ~PFINT_OICR_TSYN_TX_M; + wr32(hw, PFINT_OICR_ENA, val); - pf->ptp.tstamp_config.rx_filter = on ? HWTSTAMP_FILTER_ALL : - HWTSTAMP_FILTER_NONE; + ice_set_rx_tstamp(pf, false); } /** - * ice_ptp_cfg_timestamp - Configure timestamp for init/deinit + * ice_ptp_restore_timestamp_mode - Restore timestamp configuration * @pf: Board private structure - * @ena: bool value to enable or disable time stamp * - * This function will configure timestamping during PTP initialization - * and deinitialization + * Called at the end of rebuild to restore timestamp configuration after + * a device reset. */ -void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) +void ice_ptp_restore_timestamp_mode(struct ice_pf *pf) { - ice_set_tx_tstamp(pf, ena); - ice_set_rx_tstamp(pf, ena); + struct ice_hw *hw = &pf->hw; + bool enable_rx; + + ice_ptp_cfg_tx_interrupt(pf); + + enable_rx = pf->ptp.tstamp_config.rx_filter == HWTSTAMP_FILTER_ALL; + ice_set_rx_tstamp(pf, enable_rx); + + /* Trigger an immediate software interrupt to ensure that timestamps + * which occurred during reset are handled now. + */ + wr32(hw, PFINT_OICR, PFINT_OICR_TSYN_TX_M); + ice_flush(hw); } /** @@ -2043,10 +2058,10 @@ ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) { switch (config->tx_type) { case HWTSTAMP_TX_OFF: - ice_set_tx_tstamp(pf, false); + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_OFF; break; case HWTSTAMP_TX_ON: - ice_set_tx_tstamp(pf, true); + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_ON; break; default: return -ERANGE; @@ -2054,7 +2069,7 @@ ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) switch (config->rx_filter) { case HWTSTAMP_FILTER_NONE: - ice_set_rx_tstamp(pf, false); + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; break; case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: @@ -2070,12 +2085,15 @@ ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: - ice_set_rx_tstamp(pf, true); + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_ALL; break; default: return -ERANGE; } + /* Immediately update the device timestamping mode */ + ice_ptp_restore_timestamp_mode(pf); + return 0; } @@ -2743,7 +2761,7 @@ void ice_ptp_prepare_for_reset(struct ice_pf *pf) clear_bit(ICE_FLAG_PTP, pf->flags); /* Disable timestamping for both Tx and Rx */ - ice_ptp_cfg_timestamp(pf, false); + ice_ptp_disable_timestamp_mode(pf); kthread_cancel_delayed_work_sync(&ptp->work); @@ -3061,7 +3079,7 @@ void ice_ptp_release(struct ice_pf *pf) return; /* Disable timestamping for both Tx and Rx */ - ice_ptp_cfg_timestamp(pf, false); + ice_ptp_disable_timestamp_mode(pf); ice_ptp_remove_auxbus_device(pf); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.h b/drivers/net/ethernet/intel/ice/ice_ptp.h index 8f6f94392756..06a330867fc9 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp.h @@ -292,7 +292,7 @@ int ice_ptp_clock_index(struct ice_pf *pf); struct ice_pf; int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr); int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr); -void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena); +void ice_ptp_restore_timestamp_mode(struct ice_pf *pf); void ice_ptp_extts_event(struct ice_pf *pf); s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb); @@ -317,8 +317,7 @@ static inline int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr) return -EOPNOTSUPP; } -static inline void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) { } - +static inline void ice_ptp_restore_timestamp_mode(struct ice_pf *pf) { } static inline void ice_ptp_extts_event(struct ice_pf *pf) { } static inline s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb) -- cgit From 4e20655e503e3a478cd1682bf25e3202dd823da8 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 21 Nov 2023 13:13:36 -0800 Subject: i40e: Fix adding unsupported cloud filters If a VF tries to add unsupported cloud filter through virtchnl then i40e_add_del_cloud_filter(_big_buf) returns -ENOTSUPP but this error code is stored in 'ret' instead of 'aq_ret' that is used as error code sent back to VF. In this scenario where one of the mentioned functions fails the value of 'aq_ret' is zero so the VF will incorrectly receive a 'success'. Use 'aq_ret' to store return value and remove 'ret' local variable. Additionally fix the issue when filter allocation fails, in this case no notification is sent back to the VF. Fixes: e284fc280473 ("i40e: Add and delete cloud filter") Reviewed-by: Simon Horman Signed-off-by: Ivan Vecera Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20231121211338.3348677-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 08d7edccfb8d..3f99eb198245 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3844,7 +3844,7 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = NULL; int aq_ret = 0; - int i, ret; + int i; if (!i40e_sync_vf_state(vf, I40E_VF_STATE_ACTIVE)) { aq_ret = -EINVAL; @@ -3868,8 +3868,10 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) } cfilter = kzalloc(sizeof(*cfilter), GFP_KERNEL); - if (!cfilter) - return -ENOMEM; + if (!cfilter) { + aq_ret = -ENOMEM; + goto err_out; + } /* parse destination mac address */ for (i = 0; i < ETH_ALEN; i++) @@ -3917,13 +3919,13 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) /* Adding cloud filter programmed as TC filter */ if (tcf.dst_port) - ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, true); + aq_ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, true); else - ret = i40e_add_del_cloud_filter(vsi, cfilter, true); - if (ret) { + aq_ret = i40e_add_del_cloud_filter(vsi, cfilter, true); + if (aq_ret) { dev_err(&pf->pdev->dev, "VF %d: Failed to add cloud filter, err %pe aq_err %s\n", - vf->vf_id, ERR_PTR(ret), + vf->vf_id, ERR_PTR(aq_ret), i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); goto err_free; } -- cgit From f0863888f6cfef33e3117dccfe94fa78edf76be4 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Wed, 22 Nov 2023 00:16:42 +0300 Subject: vsock/test: fix SEQPACKET message bounds test Tune message length calculation to make this test work on machines where 'getpagesize()' returns >32KB. Now maximum message length is not hardcoded (on machines above it was smaller than 'getpagesize()' return value, thus we get negative value and test fails), but calculated at runtime and always bigger than 'getpagesize()' result. Reproduced on aarch64 with 64KB page size. Fixes: 5c338112e48a ("test/vsock: rework message bounds test") Signed-off-by: Arseniy Krasnov Reported-by: Bogdan Marcynkov Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20231121211642.163474-1-avkrasnov@salutedevices.com Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 5b0e93f9996c..01fa816868bc 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -353,11 +353,12 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) } #define SOCK_BUF_SIZE (2 * 1024 * 1024) -#define MAX_MSG_SIZE (32 * 1024) +#define MAX_MSG_PAGES 4 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) { unsigned long curr_hash; + size_t max_msg_size; int page_size; int msg_count; int fd; @@ -373,7 +374,8 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) curr_hash = 0; page_size = getpagesize(); - msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE; + max_msg_size = MAX_MSG_PAGES * page_size; + msg_count = SOCK_BUF_SIZE / max_msg_size; for (int i = 0; i < msg_count; i++) { size_t buf_size; @@ -383,7 +385,7 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) /* Use "small" buffers and "big" buffers. */ if (i & 1) buf_size = page_size + - (rand() % (MAX_MSG_SIZE - page_size)); + (rand() % (max_msg_size - page_size)); else buf_size = 1 + (rand() % page_size); @@ -429,7 +431,6 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) unsigned long remote_hash; unsigned long curr_hash; int fd; - char buf[MAX_MSG_SIZE]; struct msghdr msg = {0}; struct iovec iov = {0}; @@ -457,8 +458,13 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) control_writeln("SRVREADY"); /* Wait, until peer sends whole data. */ control_expectln("SENDDONE"); - iov.iov_base = buf; - iov.iov_len = sizeof(buf); + iov.iov_len = MAX_MSG_PAGES * getpagesize(); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + perror("malloc"); + exit(EXIT_FAILURE); + } + msg.msg_iov = &iov; msg.msg_iovlen = 1; @@ -483,6 +489,7 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) curr_hash += hash_djb2(msg.msg_iov[0].iov_base, recv_size); } + free(iov.iov_base); close(fd); remote_hash = control_readulong(); -- cgit From fd0413bbf8b11f56e8aa842783b0deda0dfe2926 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 21 Nov 2023 16:42:17 -0800 Subject: net: axienet: Fix check for partial TX checksum Due to a typo, the code checked the RX checksum feature in the TX path. Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Samuel Holland Reviewed-by: Andrew Lunn Reviewed-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/20231122004219.3504219-1-samuel.holland@sifive.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 82d0d44b2b02..bf6e33990490 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -822,7 +822,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (lp->features & XAE_FEATURE_FULL_TX_CSUM) { /* Tx Full Checksum Offload Enabled */ cur_p->app0 |= 2; - } else if (lp->features & XAE_FEATURE_PARTIAL_RX_CSUM) { + } else if (lp->features & XAE_FEATURE_PARTIAL_TX_CSUM) { csum_start_off = skb_transport_offset(skb); csum_index_off = csum_start_off + skb->csum_offset; /* Tx Partial Checksum Offload Enabled */ -- cgit From 53f2cb491b500897a619ff6abd72f565933760f0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 22 Nov 2023 22:44:47 +0100 Subject: tls: fix NULL deref on tls_sw_splice_eof() with empty record syzkaller discovered that if tls_sw_splice_eof() is executed as part of sendfile() when the plaintext/ciphertext sk_msg are empty, the send path gets confused because the empty ciphertext buffer does not have enough space for the encryption overhead. This causes tls_push_record() to go on the `split = true` path (which is only supposed to be used when interacting with an attached BPF program), and then get further confused and hit the tls_merge_open_record() path, which then assumes that there must be at least one populated buffer element, leading to a NULL deref. It is possible to have empty plaintext/ciphertext buffers if we previously bailed from tls_sw_sendmsg_locked() via the tls_trim_both_msgs() path. tls_sw_push_pending_record() already handles this case correctly; let's do the same check in tls_sw_splice_eof(). Fixes: df720d288dbb ("tls/sw: Use splice_eof() to flush") Cc: stable@vger.kernel.org Reported-by: syzbot+40d43509a099ea756317@syzkaller.appspotmail.com Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20231122214447.675768-1-jannh@google.com Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a78e8e722409..316f76187962 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1232,11 +1232,14 @@ void tls_sw_splice_eof(struct socket *sock) lock_sock(sk); retry: + /* same checks as in tls_sw_push_pending_record() */ rec = ctx->open_rec; if (!rec) goto unlock; msg_pl = &rec->msg_plaintext; + if (msg_pl->sg.size == 0) + goto unlock; /* Check the BPF advisor and perform transmission. */ ret = bpf_exec_tx_verdict(msg_pl, sk, false, TLS_RECORD_TYPE_DATA, -- cgit From 37f0205538baf70beb57cdcb6c7d14aa13257926 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 22 Nov 2023 17:17:08 -0600 Subject: net: ipa: fix one GSI register field width The width of the R_LENGTH field of the EV_CH_E_CNTXT_1 GSI register is 24 bits (not 20 bits) starting with IPA v5.0. Fix this. Fixes: faf0678ec8a0 ("net: ipa: add IPA v5.0 GSI register definitions") Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20231122231708.896632-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/reg/gsi_reg-v5.0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/reg/gsi_reg-v5.0.c b/drivers/net/ipa/reg/gsi_reg-v5.0.c index d7b81a36d673..145eb0bd096d 100644 --- a/drivers/net/ipa/reg/gsi_reg-v5.0.c +++ b/drivers/net/ipa/reg/gsi_reg-v5.0.c @@ -78,7 +78,7 @@ REG_STRIDE_FIELDS(EV_CH_E_CNTXT_0, ev_ch_e_cntxt_0, 0x0001c000 + 0x12000 * GSI_EE_AP, 0x80); static const u32 reg_ev_ch_e_cntxt_1_fmask[] = { - [R_LENGTH] = GENMASK(19, 0), + [R_LENGTH] = GENMASK(23, 0), }; REG_STRIDE_FIELDS(EV_CH_E_CNTXT_1, ev_ch_e_cntxt_1, -- cgit From 2be35a619482c1f4e5bc7a2d84049b8d7d171882 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 22 Nov 2023 19:06:24 -0800 Subject: tools: ynl: fix header path for nfsd The makefile dependency is trying to include the wrong header: : fatal error: ../../../../include/uapi//linux/nfsd.h: No such file or directory The guard also looks wrong. Fixes: f14122b2c2ac ("tools: ynl: Add source files for nfsd netlink protocol") Reviewed-by: Chuck Lever Link: https://lore.kernel.org/r/20231123030624.1611925-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 64d139400db1..3110f84dd029 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -18,4 +18,4 @@ CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h) CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) -CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_H,nfsd.h) +CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) -- cgit From 39f04b1406b23fcc129a67e70d6205d5a7322f38 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 22 Nov 2023 19:05:58 -0800 Subject: tools: ynl: fix duplicate op name in devlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't support CRUD-inspired message types in YNL too well. One aspect that currently trips us up is the fact that single message ID can be used in multiple commands (as the response). This leads to duplicate entries in the id-to-string tables: devlink-user.c:19:34: warning: initialized field overwritten [-Woverride-init] 19 | [DEVLINK_CMD_PORT_NEW] = "port-new", | ^~~~~~~~~~ devlink-user.c:19:34: note: (near initialization for ‘devlink_op_strmap[7]’) Fixes tag points at where the code was generated, the "real" problem is that the code generator does not support CRUD. Fixes: f2f9dd164db0 ("netlink: specs: devlink: add the remaining command to generate complete split_ops") Link: https://lore.kernel.org/r/20231123030558.1611831-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/generated/devlink-user.c | 2 +- tools/net/ynl/ynl-gen-c.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/generated/devlink-user.c b/tools/net/ynl/generated/devlink-user.c index bc5065bd99b2..c12ca87ca2bb 100644 --- a/tools/net/ynl/generated/devlink-user.c +++ b/tools/net/ynl/generated/devlink-user.c @@ -15,7 +15,7 @@ /* Enums */ static const char * const devlink_op_strmap[] = { [3] = "get", - [7] = "port-get", + // skip "port-get", duplicate reply value [DEVLINK_CMD_PORT_NEW] = "port-new", [13] = "sb-get", [17] = "sb-pool-get", diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index c4003a83cd5d..3bd6b928c14f 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -1505,6 +1505,12 @@ def put_op_name(family, cw): cw.block_start(line=f"static const char * const {map_name}[] =") for op_name, op in family.msgs.items(): if op.rsp_value: + # Make sure we don't add duplicated entries, if multiple commands + # produce the same response in legacy families. + if family.rsp_by_value[op.rsp_value] != op: + cw.p(f'// skip "{op_name}", duplicate reply value') + continue + if op.req_value == op.rsp_value: cw.p(f'[{op.enum_name}] = "{op_name}",') else: -- cgit