From 49b02a19c23a6541026ae8d36fee85ce8af11b60 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 27 Oct 2023 08:50:43 -0700 Subject: net: sched: Fill in MODULE_DESCRIPTION for act_gate W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Gate is the only TC action that is lacking such description. Fill MODULE_DESCRIPTION for Gate TC ACTION. Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20231027155045.46291-2-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/act_gate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index c9a811f4c7ee..393b78729216 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -677,4 +677,5 @@ static void __exit gate_cleanup_module(void) module_init(gate_init_module); module_exit(gate_cleanup_module); +MODULE_DESCRIPTION("TC gate action"); MODULE_LICENSE("GPL v2"); -- cgit From a9c92771fa23b263ac747fc5a12e0e233aed23d5 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 27 Oct 2023 08:50:44 -0700 Subject: net: sched: Fill in missing MODULE_DESCRIPTION for classifiers W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Fill in missing MODULE_DESCRIPTIONs for TC classifiers. Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20231027155045.46291-3-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/cls_basic.c | 1 + net/sched/cls_cgroup.c | 1 + net/sched/cls_fw.c | 1 + net/sched/cls_route.c | 1 + net/sched/cls_u32.c | 1 + 5 files changed, 5 insertions(+) diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 1b92c33b5f81..a1f56931330c 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -341,4 +341,5 @@ static void __exit exit_basic(void) module_init(init_basic) module_exit(exit_basic) +MODULE_DESCRIPTION("TC basic classifier"); MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index bd9322d71910..7ee8dbf49ed0 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -222,4 +222,5 @@ static void __exit exit_cgroup_cls(void) module_init(init_cgroup_cls); module_exit(exit_cgroup_cls); +MODULE_DESCRIPTION("TC cgroup classifier"); MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index c49d6af0e048..afc534ee0a18 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -446,4 +446,5 @@ static void __exit exit_fw(void) module_init(init_fw) module_exit(exit_fw) +MODULE_DESCRIPTION("SKB mark based TC classifier"); MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 1424bfeaca73..12a505db4183 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -684,4 +684,5 @@ static void __exit exit_route4(void) module_init(init_route4) module_exit(exit_route4) +MODULE_DESCRIPTION("Routing table realm based TC classifier"); MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 6663e971a13e..d5bdfd4a7655 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1489,4 +1489,5 @@ static void __exit exit_u32(void) module_init(init_u32) module_exit(exit_u32) +MODULE_DESCRIPTION("Universal 32bit based TC Classifier"); MODULE_LICENSE("GPL"); -- cgit From f96118c5d86f03d81bc24c7941f133ae5dd56a7b Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 27 Oct 2023 08:50:45 -0700 Subject: net: sched: Fill in missing MODULE_DESCRIPTION for qdiscs W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Fill in missing MODULE_DESCRIPTIONs for TC qdiscs. Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20231027155045.46291-4-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cbs.c | 1 + net/sched/sch_choke.c | 1 + net/sched/sch_drr.c | 1 + net/sched/sch_etf.c | 1 + net/sched/sch_ets.c | 1 + net/sched/sch_fifo.c | 1 + net/sched/sch_gred.c | 1 + net/sched/sch_hfsc.c | 1 + net/sched/sch_htb.c | 1 + net/sched/sch_ingress.c | 1 + net/sched/sch_mqprio.c | 1 + net/sched/sch_mqprio_lib.c | 1 + net/sched/sch_multiq.c | 1 + net/sched/sch_netem.c | 1 + net/sched/sch_plug.c | 1 + net/sched/sch_prio.c | 1 + net/sched/sch_qfq.c | 1 + net/sched/sch_red.c | 1 + net/sched/sch_sfq.c | 1 + net/sched/sch_skbprio.c | 1 + net/sched/sch_taprio.c | 1 + net/sched/sch_tbf.c | 1 + net/sched/sch_teql.c | 1 + 23 files changed, 23 insertions(+) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index cac870eb7897..9a0b85190a2c 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -574,3 +574,4 @@ static void __exit cbs_module_exit(void) module_init(cbs_module_init) module_exit(cbs_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Credit Based shaper"); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 19c851125901..ae1da08e268f 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -513,3 +513,4 @@ module_init(choke_module_init) module_exit(choke_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Choose and keep responsive flows scheduler"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 19901e77cd3b..097740a9afea 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -495,3 +495,4 @@ static void __exit drr_exit(void) module_init(drr_init); module_exit(drr_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Deficit Round Robin scheduler"); diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 61d1f0e32cf3..4808159a5466 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -513,3 +513,4 @@ static void __exit etf_module_exit(void) module_init(etf_module_init) module_exit(etf_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Earliest TxTime First (ETF) qdisc"); diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index b10efeaf0629..f7c88495946b 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -826,3 +826,4 @@ static void __exit ets_exit(void) module_init(ets_init); module_exit(ets_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Enhanced Transmission Selection(ETS) scheduler"); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index e1040421b797..450f5c67ac49 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -269,3 +269,4 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, return q ? : ERR_PTR(err); } EXPORT_SYMBOL(fifo_create_dflt); +MODULE_DESCRIPTION("Single queue packet and byte based First In First Out(P/BFIFO) scheduler"); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 872d127c9db4..8c61eb3dc943 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -945,3 +945,4 @@ module_init(gred_module_init) module_exit(gred_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Generic Random Early Detection qdisc"); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 880c5f16b29c..16c45da4036a 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1693,5 +1693,6 @@ hfsc_cleanup(void) } MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Hierarchical Fair Service Curve scheduler"); module_init(hfsc_init); module_exit(hfsc_cleanup); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 0d947414e616..7349233eaa9b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -2179,3 +2179,4 @@ static void __exit htb_module_exit(void) module_init(htb_module_init) module_exit(htb_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler"); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index a463a63192c3..5fa9eaa79bfc 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -370,3 +370,4 @@ module_exit(ingress_module_exit); MODULE_ALIAS("sch_clsact"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Ingress and clsact based ingress and egress qdiscs"); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 793009f445c0..43e53ee00a56 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -789,3 +789,4 @@ module_init(mqprio_module_init); module_exit(mqprio_module_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Classful multiqueue prio qdisc"); diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c index 83b3793c4012..b3a5572c167b 100644 --- a/net/sched/sch_mqprio_lib.c +++ b/net/sched/sch_mqprio_lib.c @@ -129,3 +129,4 @@ void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE], EXPORT_SYMBOL_GPL(mqprio_fp_to_offload); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Shared mqprio qdisc code currently between taprio and mqprio"); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 75c9c860182b..d66d5f0ec080 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -410,3 +410,4 @@ module_init(multiq_module_init) module_exit(multiq_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Multi queue to hardware queue mapping qdisc"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 6ba2dc191ed9..fa678eb88528 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1307,3 +1307,4 @@ static void __exit netem_module_exit(void) module_init(netem_module_init) module_exit(netem_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Network characteristics emulator qdisc"); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 35f49edf63db..992f0c8d7988 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -226,3 +226,4 @@ static void __exit plug_module_exit(void) module_init(plug_module_init) module_exit(plug_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Qdisc to plug and unplug traffic via netlink control"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index fdc5ef52c3ee..8ecdd3ef6f8e 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -433,3 +433,4 @@ module_init(prio_module_init) module_exit(prio_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Simple 3-band priority qdisc"); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 28315166fe8e..48a604c320c7 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1535,3 +1535,4 @@ static void __exit qfq_exit(void) module_init(qfq_init); module_exit(qfq_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Quick Fair Queueing Plus qdisc"); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 16277b6a0238..607b6c8b3a9b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -563,3 +563,4 @@ module_init(red_module_init) module_exit(red_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Random Early Detection qdisc"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 66dcb18638fe..eb77558fa367 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -937,3 +937,4 @@ static void __exit sfq_module_exit(void) module_init(sfq_module_init) module_exit(sfq_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Stochastic Fairness qdisc"); diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 5df2dacb7b1a..28beb11762d8 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -307,3 +307,4 @@ module_init(skbprio_module_init) module_exit(skbprio_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SKB priority based scheduling qdisc"); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2e1949de4171..31a8252bd09c 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2572,3 +2572,4 @@ static void __exit taprio_module_exit(void) module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Time Aware Priority qdisc"); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 17d2d00ddb18..dd6b1a723bf7 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -621,3 +621,4 @@ static void __exit tbf_module_exit(void) module_init(tbf_module_init) module_exit(tbf_module_exit) MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Token Bucket Filter qdisc"); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 7721239c185f..59304611dc00 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -523,3 +523,4 @@ module_init(teql_init); module_exit(teql_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("True (or trivial) link equalizer qdisc"); -- cgit From dd9d75fcf0f427ddcde4bde736908684ee05c353 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 28 Oct 2023 20:44:57 +0200 Subject: net: phy: fill in missing MODULE_DESCRIPTION()s W=1 builds now warn if a module is built without a MODULE_DESCRIPTION(). Fill them in based on the Kconfig text, or similar. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Acked-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20231028184458.99448-2-andrew@lunn.ch Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm-phy-ptp.c | 1 + drivers/net/phy/bcm87xx.c | 1 + drivers/net/phy/phylink.c | 1 + drivers/net/phy/sfp.c | 1 + 4 files changed, 4 insertions(+) diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c index ef00d6163061..cb4b91af5e17 100644 --- a/drivers/net/phy/bcm-phy-ptp.c +++ b/drivers/net/phy/bcm-phy-ptp.c @@ -942,3 +942,4 @@ struct bcm_ptp_private *bcm_ptp_probe(struct phy_device *phydev) EXPORT_SYMBOL_GPL(bcm_ptp_probe); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Broadcom PHY PTP driver"); diff --git a/drivers/net/phy/bcm87xx.c b/drivers/net/phy/bcm87xx.c index cc2858107668..e81404bf8994 100644 --- a/drivers/net/phy/bcm87xx.c +++ b/drivers/net/phy/bcm87xx.c @@ -223,3 +223,4 @@ static struct phy_driver bcm87xx_driver[] = { module_phy_driver(bcm87xx_driver); MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Broadcom BCM87xx PHY driver"); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 6712883498bb..6fdc6ba294d5 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -3726,3 +3726,4 @@ static int __init phylink_init(void) module_init(phylink_init); MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("phylink models the MAC to optional PHY connection"); diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index b8c0961daf53..5468bd209fab 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -3153,3 +3153,4 @@ module_exit(sfp_exit); MODULE_ALIAS("platform:sfp"); MODULE_AUTHOR("Russell King"); MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("SFP cage support"); -- cgit From 031fba65fc202abf1f193e321be7a2c274fd88ba Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 28 Oct 2023 20:44:58 +0200 Subject: net: mdio: fill in missing MODULE_DESCRIPTION()s W=1 builds now warn if a module is built without a MODULE_DESCRIPTION(). Fill them in based on the Kconfig text, or similar. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Acked-by: Andrew Jeffery Link: https://lore.kernel.org/r/20231028184458.99448-3-andrew@lunn.ch Signed-off-by: Jakub Kicinski --- drivers/net/mdio/acpi_mdio.c | 1 + drivers/net/mdio/fwnode_mdio.c | 1 + drivers/net/mdio/mdio-aspeed.c | 1 + drivers/net/mdio/mdio-bitbang.c | 1 + drivers/net/mdio/of_mdio.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/net/mdio/acpi_mdio.c b/drivers/net/mdio/acpi_mdio.c index 4630dde01974..5d0f11f280cf 100644 --- a/drivers/net/mdio/acpi_mdio.c +++ b/drivers/net/mdio/acpi_mdio.c @@ -16,6 +16,7 @@ MODULE_AUTHOR("Calvin Johnson "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ACPI MDIO bus (Ethernet PHY) accessors"); /** * __acpi_mdiobus_register - Register mii_bus and create PHYs from the ACPI ASL. diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c index 1183ef5e203e..fd02f5cbc853 100644 --- a/drivers/net/mdio/fwnode_mdio.c +++ b/drivers/net/mdio/fwnode_mdio.c @@ -14,6 +14,7 @@ MODULE_AUTHOR("Calvin Johnson "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("FWNODE MDIO bus (Ethernet PHY) accessors"); static struct pse_control * fwnode_find_pse_control(struct fwnode_handle *fwnode) diff --git a/drivers/net/mdio/mdio-aspeed.c b/drivers/net/mdio/mdio-aspeed.c index 70edeeb7771e..c2170650415c 100644 --- a/drivers/net/mdio/mdio-aspeed.c +++ b/drivers/net/mdio/mdio-aspeed.c @@ -205,3 +205,4 @@ module_platform_driver(aspeed_mdio_driver); MODULE_AUTHOR("Andrew Jeffery "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ASPEED MDIO bus controller"); diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c index 81b7748c10ce..f88639297ff2 100644 --- a/drivers/net/mdio/mdio-bitbang.c +++ b/drivers/net/mdio/mdio-bitbang.c @@ -263,3 +263,4 @@ void free_mdio_bitbang(struct mii_bus *bus) EXPORT_SYMBOL(free_mdio_bitbang); MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Bitbanged MDIO buses"); diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c index 7eb32ebb846d..64ebcb6d235c 100644 --- a/drivers/net/mdio/of_mdio.c +++ b/drivers/net/mdio/of_mdio.c @@ -25,6 +25,7 @@ MODULE_AUTHOR("Grant Likely "); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OpenFirmware MDIO bus (Ethernet PHY) accessors"); /* Extract the clause 22 phy ID from the compatible string of the form * ethernet-phy-idAAAA.BBBB */ -- cgit From 05f0431bb90f2ee3657e7fc2678f11a1f9b778b7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Oct 2023 17:17:50 +0100 Subject: netlink: specs: devlink: add forgotten port function caps enum values Add two enum values that the blamed commit omitted. Fixes: f2f9dd164db0 ("netlink: specs: devlink: add the remaining command to generate complete split_ops") Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20231030161750.110420-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 4 ++++ net/devlink/netlink_gen.c | 2 +- tools/net/ynl/generated/devlink-user.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index c6ba4889575a..572d83a414d0 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -71,6 +71,10 @@ definitions: name: roce-bit - name: migratable-bit + - + name: ipsec-crypto-bit + - + name: ipsec-packet-bit - type: enum name: sb-threshold-type diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index 9cbae0169249..788dfdc498a9 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -15,7 +15,7 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY, }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_MAX(NLA_U8, 1), [DEVLINK_PORT_FN_ATTR_OPSTATE] = NLA_POLICY_MAX(NLA_U8, 1), - [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(3), + [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { diff --git a/tools/net/ynl/generated/devlink-user.c b/tools/net/ynl/generated/devlink-user.c index 75b744b47986..bc5065bd99b2 100644 --- a/tools/net/ynl/generated/devlink-user.c +++ b/tools/net/ynl/generated/devlink-user.c @@ -121,6 +121,8 @@ const char *devlink_port_fn_opstate_str(enum devlink_port_fn_opstate value) static const char * const devlink_port_fn_attr_cap_strmap[] = { [0] = "roce-bit", [1] = "migratable-bit", + [2] = "ipsec-crypto-bit", + [3] = "ipsec-packet-bit", }; const char *devlink_port_fn_attr_cap_str(enum devlink_port_fn_attr_cap value) -- cgit From 2b7ac0c87d985c92e519995853c52b9649ea4b07 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 27 Oct 2023 15:34:08 -0700 Subject: tools: ynl-gen: don't touch the output file if content is the same I often regenerate all YNL files in the tree to make sure they are in sync with the codegen and specs. Generator rewrites the files unconditionally, so since make looks at file modification time to decide what to rebuild - my next build takes longer. We already generate the code to a tempfile most of the time, only overwrite the target when we have to. Before: $ stat include/uapi/linux/netdev.h File: include/uapi/linux/netdev.h Size: 2307 Blocks: 8 IO Block: 4096 regular file Access: 2023-10-27 15:19:56.347071940 -0700 Modify: 2023-10-27 15:19:45.089000900 -0700 Change: 2023-10-27 15:19:45.089000900 -0700 Birth: 2023-10-27 15:19:45.088000894 -0700 $ ./tools/net/ynl/ynl-regen.sh -f [...] $ stat include/uapi/linux/netdev.h File: include/uapi/linux/netdev.h Size: 2307 Blocks: 8 IO Block: 4096 regular file Access: 2023-10-27 15:19:56.347071940 -0700 Modify: 2023-10-27 15:22:18.417968446 -0700 Change: 2023-10-27 15:22:18.417968446 -0700 Birth: 2023-10-27 15:19:45.088000894 -0700 After: $ stat include/uapi/linux/netdev.h File: include/uapi/linux/netdev.h Size: 2307 Blocks: 8 IO Block: 4096 regular file Access: 2023-10-27 15:22:41.520114221 -0700 Modify: 2023-10-27 15:22:18.417968446 -0700 Change: 2023-10-27 15:22:18.417968446 -0700 Birth: 2023-10-27 15:19:45.088000894 -0700 $ ./tools/net/ynl/ynl-regen.sh -f [...] $ stat include/uapi/linux/netdev.h File: include/uapi/linux/netdev.h Size: 2307 Blocks: 8 IO Block: 4096 regular file Access: 2023-10-27 15:22:41.520114221 -0700 Modify: 2023-10-27 15:22:18.417968446 -0700 Change: 2023-10-27 15:22:18.417968446 -0700 Birth: 2023-10-27 15:19:45.088000894 -0700 Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231027223408.1865704-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 13427436bfb7..c4003a83cd5d 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -3,6 +3,7 @@ import argparse import collections +import filecmp import os import re import shutil @@ -1168,7 +1169,7 @@ class CodeWriter: if out_file is None: self._out = os.sys.stdout else: - self._out = tempfile.TemporaryFile('w+') + self._out = tempfile.NamedTemporaryFile('w+') self._out_file = out_file def __del__(self): @@ -1177,6 +1178,10 @@ class CodeWriter: def close_out_file(self): if self._out == os.sys.stdout: return + # Avoid modifying the file if contents didn't change + self._out.flush() + if os.path.isfile(self._out_file) and filecmp.cmp(self._out.name, self._out_file, shallow=False): + return with open(self._out_file, 'w+') as out_file: self._out.seek(0) shutil.copyfileobj(self._out, out_file) -- cgit From d280783c3ad9ae31c01c1b3bdd2e8353a223599b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 28 Oct 2023 22:48:35 +0200 Subject: net: xscale: Drop unused PHY number For some cargoculted reason on incomplete cleanup, we have a PHY number which refers to nothing and gives confusing messages about PHY 0 on all ports. Print the name of the actual PHY device instead. Reported-by: Howard Harte Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20231028-ixp4xx-eth-id-v1-1-57be486d7f0f@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xscale/ixp4xx_eth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c index 531bf919aef5..e0d26148dfd9 100644 --- a/drivers/net/ethernet/xscale/ixp4xx_eth.c +++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c @@ -163,7 +163,6 @@ typedef void buffer_t; /* Information about built-in Ethernet MAC interfaces */ struct eth_plat_info { - u8 phy; /* MII PHY ID, 0 - 31 */ u8 rxq; /* configurable, currently 0 - 31 only */ u8 txreadyq; u8 hwaddr[ETH_ALEN]; @@ -1583,7 +1582,7 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) if ((err = register_netdev(ndev))) goto err_phy_dis; - netdev_info(ndev, "%s: MII PHY %i on %s\n", ndev->name, plat->phy, + netdev_info(ndev, "%s: MII PHY %s on %s\n", ndev->name, phydev_name(phydev), npe_name(port->npe)); return 0; -- cgit From 7b3ba18703a63f6fd487183b9262b08e5632da1b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 25 Oct 2023 19:42:38 -0400 Subject: llc: verify mac len before reading mac header LLC reads the mac header with eth_hdr without verifying that the skb has an Ethernet header. Syzbot was able to enter llc_rcv on a tun device. Tun can insert packets without mac len and with user configurable skb->protocol (passing a tun_pi header when not configuring IFF_NO_PI). BUG: KMSAN: uninit-value in llc_station_ac_send_test_r net/llc/llc_station.c:81 [inline] BUG: KMSAN: uninit-value in llc_station_rcv+0x6fb/0x1290 net/llc/llc_station.c:111 llc_station_ac_send_test_r net/llc/llc_station.c:81 [inline] llc_station_rcv+0x6fb/0x1290 net/llc/llc_station.c:111 llc_rcv+0xc5d/0x14a0 net/llc/llc_input.c:218 __netif_receive_skb_one_core net/core/dev.c:5523 [inline] __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5637 netif_receive_skb_internal net/core/dev.c:5723 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5782 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 tun_get_user+0x54c5/0x69c0 drivers/net/tun.c:2002 Add a mac_len test before all three eth_hdr(skb) calls under net/llc. There are further uses in include/net/llc_pdu.h. All these are protected by a test skb->protocol == ETH_P_802_2. Which does not protect against this tun scenario. But the mac_len test added in this patch in llc_fixup_skb will indirectly protect those too. That is called from llc_rcv before any other LLC code. It is tempting to just add a blanket mac_len check in llc_rcv, but not sure whether that could break valid LLC paths that do not assume an Ethernet header. 802.2 LLC may be used on top of non-802.3 protocols in principle. The below referenced commit shows that used to, on top of Token Ring. At least one of the three eth_hdr uses goes back to before the start of git history. But the one that syzbot exercises is introduced in this commit. That commit is old enough (2008), that effectively all stable kernels should receive this. Fixes: f83f1768f833 ("[LLC]: skb allocation size for responses") Reported-by: syzbot+a8c7be6dee0de1b669cc@syzkaller.appspotmail.com Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20231025234251.3796495-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/llc/llc_input.c | 10 ++++++++-- net/llc/llc_s_ac.c | 3 +++ net/llc/llc_station.c | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 7cac441862e2..51bccfb00a9c 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -127,8 +127,14 @@ static inline int llc_fixup_skb(struct sk_buff *skb) skb->transport_header += llc_len; skb_pull(skb, llc_len); if (skb->protocol == htons(ETH_P_802_2)) { - __be16 pdulen = eth_hdr(skb)->h_proto; - s32 data_size = ntohs(pdulen) - llc_len; + __be16 pdulen; + s32 data_size; + + if (skb->mac_len < ETH_HLEN) + return 0; + + pdulen = eth_hdr(skb)->h_proto; + data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index 79d1cef8f15a..06fb8e6944b0 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -153,6 +153,9 @@ int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb) int rc = 1; u32 data_size; + if (skb->mac_len < ETH_HLEN) + return 1; + llc_pdu_decode_sa(skb, mac_da); llc_pdu_decode_da(skb, mac_sa); llc_pdu_decode_ssap(skb, &dsap); diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 05c6ae092053..f50654292510 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -76,6 +76,9 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb) u32 data_size; struct sk_buff *nskb; + if (skb->mac_len < ETH_HLEN) + goto out; + /* The test request command is type U (llc_len = 3) */ data_size = ntohs(eth_hdr(skb)->h_proto) - 3; nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size); -- cgit From 876f8ab52363f649bcc74072157dfd7adfbabc0d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Oct 2023 15:19:01 +0300 Subject: hsr: Prevent use after free in prp_create_tagged_frame() The prp_fill_rct() function can fail. In that situation, it frees the skb and returns NULL. Meanwhile on the success path, it returns the original skb. So it's straight forward to fix bug by using the returned value. Fixes: 451d8123f897 ("net: prp: add packet handling support") Signed-off-by: Dan Carpenter Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/57af1f28-7f57-4a96-bcd3-b7a0f2340845@moroto.mountain Signed-off-by: Jakub Kicinski --- net/hsr/hsr_forward.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index b71dab630a87..80cdc6f6b34c 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -342,9 +342,7 @@ struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, skb = skb_copy_expand(frame->skb_std, 0, skb_tailroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); - prp_fill_rct(skb, frame, port); - - return skb; + return prp_fill_rct(skb, frame, port); } static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, -- cgit From 19b3f72a41a8751e26bffc093bb7e1cef29ad579 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Mon, 30 Oct 2023 16:55:40 +0900 Subject: tipc: Change nla_policy for bearer-related names to NLA_NUL_STRING syzbot reported the following uninit-value access issue [1]: ===================================================== BUG: KMSAN: uninit-value in strlen lib/string.c:418 [inline] BUG: KMSAN: uninit-value in strstr+0xb8/0x2f0 lib/string.c:756 strlen lib/string.c:418 [inline] strstr+0xb8/0x2f0 lib/string.c:756 tipc_nl_node_reset_link_stats+0x3ea/0xb50 net/tipc/node.c:2595 genl_family_rcv_msg_doit net/netlink/genetlink.c:971 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1051 [inline] genl_rcv_msg+0x11ec/0x1290 net/netlink/genetlink.c:1066 netlink_rcv_skb+0x371/0x650 net/netlink/af_netlink.c:2545 genl_rcv+0x40/0x60 net/netlink/genetlink.c:1075 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0xf47/0x1250 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x1238/0x13d0 net/netlink/af_netlink.c:1910 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2541 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2595 __sys_sendmsg net/socket.c:2624 [inline] __do_sys_sendmsg net/socket.c:2633 [inline] __se_sys_sendmsg net/socket.c:2631 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2631 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was created at: slab_post_alloc_hook+0x12f/0xb70 mm/slab.h:767 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x577/0xa80 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:559 __alloc_skb+0x318/0x740 net/core/skbuff.c:650 alloc_skb include/linux/skbuff.h:1286 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1214 [inline] netlink_sendmsg+0xb34/0x13d0 net/netlink/af_netlink.c:1885 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2541 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2595 __sys_sendmsg net/socket.c:2624 [inline] __do_sys_sendmsg net/socket.c:2633 [inline] __se_sys_sendmsg net/socket.c:2631 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2631 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd TIPC bearer-related names including link names must be null-terminated strings. If a link name which is not null-terminated is passed through netlink, strstr() and similar functions can cause buffer overrun. This causes the above issue. This patch changes the nla_policy for bearer-related names from NLA_STRING to NLA_NUL_STRING. This resolves the issue by ensuring that only null-terminated strings are accepted as bearer-related names. syzbot reported similar uninit-value issue related to bearer names [2]. The root cause of this issue is that a non-null-terminated bearer name was passed. This patch also resolved this issue. Fixes: 7be57fc69184 ("tipc: add link get/dump to new netlink api") Fixes: 0655f6a8635b ("tipc: add bearer disable/enable to new netlink api") Reported-and-tested-by: syzbot+5138ca807af9d2b42574@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5138ca807af9d2b42574 [1] Reported-and-tested-by: syzbot+9425c47dccbcb4c17d51@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9425c47dccbcb4c17d51 [2] Signed-off-by: Shigeru Yoshida Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231030075540.3784537-1-syoshida@redhat.com Signed-off-by: Jakub Kicinski --- net/tipc/netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index e8fd257c0e68..1a9a5bdaccf4 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -88,7 +88,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING, + [TIPC_NLA_LINK_NAME] = { .type = NLA_NUL_STRING, .len = TIPC_MAX_LINK_NAME }, [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, @@ -125,7 +125,7 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING, + [TIPC_NLA_BEARER_NAME] = { .type = NLA_NUL_STRING, .len = TIPC_MAX_BEARER_NAME }, [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } -- cgit From 74da77921333171766031ea213b11f1e650814f9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 31 Oct 2023 12:51:09 +0300 Subject: net/tcp_sigpool: Fix some off by one bugs The "cpool_populated" variable is the number of elements in the cpool[] array that have been populated. It is incremented in tcp_sigpool_alloc_ahash() every time we populate a new element. Unpopulated elements are NULL but if we have populated every element then this code will read one element beyond the end of the array. Fixes: 8c73b26315aa ("net/tcp: Prepare tcp_md5sig_pool for TCP-AO") Signed-off-by: Dan Carpenter Reviewed-by: Dmitry Safonov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/ce915d61-04bc-44fb-b450-35fcc9fc8831@moroto.mountain Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_sigpool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c index 65a8eaae2fec..55b310a722c7 100644 --- a/net/ipv4/tcp_sigpool.c +++ b/net/ipv4/tcp_sigpool.c @@ -231,7 +231,7 @@ static void cpool_schedule_cleanup(struct kref *kref) */ void tcp_sigpool_release(unsigned int id) { - if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg)) + if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; /* slow-path */ @@ -245,7 +245,7 @@ EXPORT_SYMBOL_GPL(tcp_sigpool_release); */ void tcp_sigpool_get(unsigned int id) { - if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg)) + if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; kref_get(&cpool[id].kref); } @@ -256,7 +256,7 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RC struct crypto_ahash *hash; rcu_read_lock_bh(); - if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg)) { + if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) { rcu_read_unlock_bh(); return -EINVAL; } @@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_sigpool_end); */ size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len) { - if (WARN_ON_ONCE(id > cpool_populated || !cpool[id].alg)) + if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return -EINVAL; return strscpy(buf, cpool[id].alg, buf_len); -- cgit From 05670f81d1287c40ec861186e4c4e3401013e7fb Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 1 Nov 2023 19:16:01 +0100 Subject: bpf: fix compilation error without CGROUPS Our MPTCP CI complained [1] -- and KBuild too -- that it was no longer possible to build the kernel without CONFIG_CGROUPS: kernel/bpf/task_iter.c: In function 'bpf_iter_css_task_new': kernel/bpf/task_iter.c:919:14: error: 'CSS_TASK_ITER_PROCS' undeclared (first use in this function) 919 | case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED: | ^~~~~~~~~~~~~~~~~~~ kernel/bpf/task_iter.c:919:14: note: each undeclared identifier is reported only once for each function it appears in kernel/bpf/task_iter.c:919:36: error: 'CSS_TASK_ITER_THREADED' undeclared (first use in this function) 919 | case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED: | ^~~~~~~~~~~~~~~~~~~~~~ kernel/bpf/task_iter.c:927:60: error: invalid application of 'sizeof' to incomplete type 'struct css_task_iter' 927 | kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter)); | ^~~~~~ kernel/bpf/task_iter.c:930:9: error: implicit declaration of function 'css_task_iter_start'; did you mean 'task_seq_start'? [-Werror=implicit-function-declaration] 930 | css_task_iter_start(css, flags, kit->css_it); | ^~~~~~~~~~~~~~~~~~~ | task_seq_start kernel/bpf/task_iter.c: In function 'bpf_iter_css_task_next': kernel/bpf/task_iter.c:940:16: error: implicit declaration of function 'css_task_iter_next'; did you mean 'class_dev_iter_next'? [-Werror=implicit-function-declaration] 940 | return css_task_iter_next(kit->css_it); | ^~~~~~~~~~~~~~~~~~ | class_dev_iter_next kernel/bpf/task_iter.c:940:16: error: returning 'int' from a function with return type 'struct task_struct *' makes pointer from integer without a cast [-Werror=int-conversion] 940 | return css_task_iter_next(kit->css_it); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/bpf/task_iter.c: In function 'bpf_iter_css_task_destroy': kernel/bpf/task_iter.c:949:9: error: implicit declaration of function 'css_task_iter_end' [-Werror=implicit-function-declaration] 949 | css_task_iter_end(kit->css_it); | ^~~~~~~~~~~~~~~~~ This patch simply surrounds with a #ifdef the new code requiring CGroups support. It seems enough for the compiler and this is similar to bpf_iter_css_{new,next,destroy}() functions where no other #ifdef have been added in kernel/bpf/helpers.c and in the selftests. Fixes: 9c66dc94b62a ("bpf: Introduce css_task open-coded iterator kfuncs") Link: https://github.com/multipath-tcp/mptcp_net-next/actions/runs/6665206927 Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310260528.aHWgVFqq-lkp@intel.com/ Signed-off-by: Matthieu Baerts [ added missing ifdefs for BTF_ID cgroup definitions ] Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20231101181601.1493271-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 8 +++++--- kernel/bpf/task_iter.c | 4 ++++ kernel/bpf/verifier.c | 8 ++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e46ac288a108..95449ea7cc1b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2564,15 +2564,17 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) +#ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) -BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) +#endif +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 654601dd6b49..8967d1ac7551 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -892,6 +892,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __diag_pop(); +#ifdef CONFIG_CGROUPS + struct bpf_iter_css_task { __u64 __opaque[1]; } __attribute__((aligned(8))); @@ -950,6 +952,8 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __diag_pop(); +#endif /* CONFIG_CGROUPS */ + struct bpf_iter_task { __u64 __opaque[3]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 857d76694517..8c91e0e54e27 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5388,7 +5388,9 @@ static bool in_rcu_cs(struct bpf_verifier_env *env) /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ BTF_SET_START(rcu_protected_types) BTF_ID(struct, prog_test_ref_kfunc) +#ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) +#endif BTF_ID(struct, bpf_cpumask) BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) @@ -10835,7 +10837,9 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +#ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) +#endif BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -10861,7 +10865,11 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +#ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) +#else +BTF_ID_UNUSED +#endif static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { -- cgit From 61e4a86600029e6e8d468d1fad6b6c749bebed19 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 Oct 2023 00:49:34 +0100 Subject: rxrpc: Fix two connection reaping bugs Fix two connection reaping bugs: (1) rxrpc_connection_expiry is in units of seconds, so rxrpc_disconnect_call() needs to multiply it by HZ when adding it to jiffies. (2) rxrpc_client_conn_reap_timeout() should set RXRPC_CLIENT_REAP_TIMER if local->kill_all_client_conns is clear, not if it is set (in which case we don't need the timer). Without this, old client connections don't get cleaned up until the local endpoint is cleaned up. Fixes: 5040011d073d ("rxrpc: Make the local endpoint hold a ref on a connected call") Fixes: 0d6bf319bc5a ("rxrpc: Move the client conn cache management to the I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/783911.1698364174@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/conn_object.c | 2 +- net/rxrpc/local_object.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ac85d4644a3c..df8a271948a1 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -212,7 +212,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) conn->idle_timestamp = jiffies; if (atomic_dec_and_test(&conn->active)) rxrpc_set_service_reap_timer(conn->rxnet, - jiffies + rxrpc_connection_expiry); + jiffies + rxrpc_connection_expiry * HZ); } rxrpc_put_call(call, rxrpc_call_put_io_thread); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 7d910aee4f8c..c553a30e9c83 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -87,7 +87,7 @@ static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) struct rxrpc_local *local = container_of(timer, struct rxrpc_local, client_conn_reap_timer); - if (local->kill_all_client_conns && + if (!local->kill_all_client_conns && test_and_set_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) rxrpc_wake_up_io_thread(local); } -- cgit From 811c363645b33e6e22658634329e95f383dfc705 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Wed, 1 Nov 2023 13:33:51 +0100 Subject: bpf: Fix check_stack_write_fixed_off() to correctly spill imm In check_stack_write_fixed_off(), imm value is cast to u32 before being spilled to the stack. Therefore, the sign information is lost, and the range information is incorrect when load from the stack again. For the following prog: 0: r2 = r10 1: *(u64*)(r2 -40) = -44 2: r0 = *(u64*)(r2 - 40) 3: if r0 s<= 0xa goto +2 4: r0 = 1 5: exit 6: r0 = 0 7: exit The verifier gives: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 1: (7a) *(u64 *)(r2 -40) = -44 ; R2_w=fp0 fp-40_w=4294967252 2: (79) r0 = *(u64 *)(r2 -40) ; R0_w=4294967252 R2_w=fp0 fp-40_w=4294967252 3: (c5) if r0 s< 0xa goto pc+2 mark_precise: frame0: last_idx 3 first_idx 0 subseq_idx -1 mark_precise: frame0: regs=r0 stack= before 2: (79) r0 = *(u64 *)(r2 -40) 3: R0_w=4294967252 4: (b7) r0 = 1 ; R0_w=1 5: (95) exit verification time 7971 usec stack depth 40 processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 So remove the incorrect cast, since imm field is declared as s32, and __mark_reg_known() takes u64, so imm would be correctly sign extended by compiler. Fixes: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction") Cc: stable@vger.kernel.org Signed-off-by: Hao Sun Acked-by: Shung-Hsi Yu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231101-fix-check-stack-write-v3-1-f05c2b1473d5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c91e0e54e27..014b4d1ef408 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4674,7 +4674,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; - __mark_reg_known(&fake_reg, (u32)insn->imm); + __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; save_register_state(state, spi, &fake_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { -- cgit From 85eb035e6cfd615071256592e1dbe72c1d99c24b Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Wed, 1 Nov 2023 13:33:52 +0100 Subject: selftests/bpf: Add test for immediate spilled to stack Add a test to check if the verifier correctly reason about the sign of an immediate spilled to stack by BPF_ST instruction. Signed-off-by: Hao Sun Link: https://lore.kernel.org/r/20231101-fix-check-stack-write-v3-2-f05c2b1473d5@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/bpf_st_mem.c | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c index 3af2501082b2..b616575c3b00 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c +++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c @@ -65,3 +65,35 @@ .expected_attach_type = BPF_SK_LOOKUP, .runs = -1, }, +{ + "BPF_ST_MEM stack imm sign", + /* Check if verifier correctly reasons about sign of an + * immediate spilled to stack by BPF_ST instruction. + * + * fp[-8] = -44; + * r0 = fp[-8]; + * if r0 s< 0 goto ret0; + * r0 = -1; + * exit; + * ret0: + * r0 = 0; + * exit; + */ + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, -44), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, -1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + /* Use prog type that requires return value in range [0, 1] */ + .prog_type = BPF_PROG_TYPE_SK_LOOKUP, + .expected_attach_type = BPF_SK_LOOKUP, + .result = VERBOSE_ACCEPT, + .runs = -1, + .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8_w=-44\ + 2: (c5) if r0 s< 0x0 goto pc+2\ + R0_w=-44", +}, -- cgit From cd60f410ddc0cd663045d15936155421b6f708fd Mon Sep 17 00:00:00 2001 From: Manu Bretelle Date: Tue, 31 Oct 2023 15:36:06 -0700 Subject: selftests/bpf: fix test_bpffs Currently this tests tries to umount /sys/kernel/debug (TDIR) but the system it is running on may have mounts below. For example, danobi/vmtest [0] VMs have mount -t tracefs tracefs /sys/kernel/debug/tracing as part of their init. This change instead creates a "random" directory under /tmp and uses this as TDIR. If the directory already exists, ignore the error and keep moving on. Test: Originally: $ vmtest -k $KERNEL_REPO/arch/x86_64/boot/bzImage "./test_progs -vv -a test_bpffs" => bzImage ===> Booting ===> Setting up VM ===> Running command [ 2.138818] bpf_testmod: loading out-of-tree module taints kernel. [ 2.140913] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel bpf_testmod.ko is already unloaded. Loading bpf_testmod.ko... Successfully loaded bpf_testmod.ko. test_test_bpffs:PASS:clone 0 nsec fn:PASS:unshare 0 nsec fn:PASS:mount / 0 nsec fn:FAIL:umount /sys/kernel/debug unexpected error: -1 (errno 16) bpf_testmod.ko is already unloaded. Loading bpf_testmod.ko... Successfully loaded bpf_testmod.ko. test_test_bpffs:PASS:clone 0 nsec test_test_bpffs:PASS:waitpid 0 nsec test_test_bpffs:FAIL:bpffs test failed 255#282 test_bpffs:FAIL Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED Successfully unloaded bpf_testmod.ko. Command failed with exit code: 1 After this change: $ vmtest -k $(make image_name) 'cd tools/testing/selftests/bpf && ./test_progs -vv -a test_bpffs' => bzImage ===> Booting ===> Setting up VM ===> Running command [ 2.295696] bpf_testmod: loading out-of-tree module taints kernel. [ 2.296468] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel bpf_testmod.ko is already unloaded. Loading bpf_testmod.ko... Successfully loaded bpf_testmod.ko. test_test_bpffs:PASS:clone 0 nsec fn:PASS:unshare 0 nsec fn:PASS:mount / 0 nsec fn:PASS:mount tmpfs 0 nsec fn:PASS:mkdir /tmp/test_bpffs_testdir/fs1 0 nsec fn:PASS:mkdir /tmp/test_bpffs_testdir/fs2 0 nsec fn:PASS:mount bpffs /tmp/test_bpffs_testdir/fs1 0 nsec fn:PASS:mount bpffs /tmp/test_bpffs_testdir/fs2 0 nsec fn:PASS:reading /tmp/test_bpffs_testdir/fs1/maps.debug 0 nsec fn:PASS:reading /tmp/test_bpffs_testdir/fs2/progs.debug 0 nsec fn:PASS:creating /tmp/test_bpffs_testdir/fs1/a 0 nsec fn:PASS:creating /tmp/test_bpffs_testdir/fs1/a/1 0 nsec fn:PASS:creating /tmp/test_bpffs_testdir/fs1/b 0 nsec fn:PASS:create_map(ARRAY) 0 nsec fn:PASS:pin map 0 nsec fn:PASS:stat(/tmp/test_bpffs_testdir/fs1/a) 0 nsec fn:PASS:renameat2(/fs1/a, /fs1/b, RENAME_EXCHANGE) 0 nsec fn:PASS:stat(/tmp/test_bpffs_testdir/fs1/b) 0 nsec fn:PASS:b should have a's inode 0 nsec fn:PASS:access(/tmp/test_bpffs_testdir/fs1/b/1) 0 nsec fn:PASS:stat(/tmp/test_bpffs_testdir/fs1/map) 0 nsec fn:PASS:renameat2(/fs1/c, /fs1/b, RENAME_EXCHANGE) 0 nsec fn:PASS:stat(/tmp/test_bpffs_testdir/fs1/b) 0 nsec fn:PASS:b should have c's inode 0 nsec fn:PASS:access(/tmp/test_bpffs_testdir/fs1/c/1) 0 nsec fn:PASS:renameat2(RENAME_NOREPLACE) 0 nsec fn:PASS:access(/tmp/test_bpffs_testdir/fs1/b) 0 nsec bpf_testmod.ko is already unloaded. Loading bpf_testmod.ko... Successfully loaded bpf_testmod.ko. test_test_bpffs:PASS:clone 0 nsec test_test_bpffs:PASS:waitpid 0 nsec test_test_bpffs:PASS:bpffs test 0 nsec #282 test_bpffs:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Successfully unloaded bpf_testmod.ko. [0] https://github.com/danobi/vmtest This is a follow-up of https://lore.kernel.org/bpf/20231024201852.1512720-1-chantr4@gmail.com/T/ v1 -> v2: - use a TDIR name that is related to test - use C-style comments Signed-off-by: Manu Bretelle Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231031223606.2927976-1-chantr4@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/test_bpffs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c index 214d9f4a94a5..ea933fd151c3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c @@ -8,7 +8,8 @@ #include #include -#define TDIR "/sys/kernel/debug" +/* TDIR must be in a location we can create a directory in. */ +#define TDIR "/tmp/test_bpffs_testdir" static int read_iter(char *file) { @@ -43,8 +44,11 @@ static int fn(void) if (!ASSERT_OK(err, "mount /")) goto out; - err = umount(TDIR); - if (!ASSERT_OK(err, "umount " TDIR)) + err = mkdir(TDIR, 0777); + /* If the directory already exists we can carry on. It may be left over + * from a previous run. + */ + if ((err && errno != EEXIST) && !ASSERT_OK(err, "mkdir " TDIR)) goto out; err = mount("none", TDIR, "tmpfs", 0, NULL); @@ -138,6 +142,7 @@ out: rmdir(TDIR "/fs1"); rmdir(TDIR "/fs2"); umount(TDIR); + rmdir(TDIR); exit(err); } -- cgit From 391145ba2accc48b596f3d438af1a6255b62a555 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 31 Oct 2023 14:56:24 -0700 Subject: bpf: Add __bpf_kfunc_{start,end}_defs macros BPF kfuncs are meant to be called from BPF programs. Accordingly, most kfuncs are not called from anywhere in the kernel, which the -Wmissing-prototypes warning is unhappy about. We've peppered __diag_ignore_all("-Wmissing-prototypes", ... everywhere kfuncs are defined in the codebase to suppress this warning. This patch adds two macros meant to bound one or many kfunc definitions. All existing kfunc definitions which use these __diag calls to suppress -Wmissing-prototypes are migrated to use the newly-introduced macros. A new __diag_ignore_all - for "-Wmissing-declarations" - is added to the __bpf_kfunc_start_defs macro based on feedback from Andrii on an earlier version of this patch [0] and another recent mailing list thread [1]. In the future we might need to ignore different warnings or do other kfunc-specific things. This change will make it easier to make such modifications for all kfunc defs. [0]: https://lore.kernel.org/bpf/CAEf4BzaE5dRWtK6RPLnjTW-MW9sx9K3Fn6uwqCTChK2Dcb1Xig@mail.gmail.com/ [1]: https://lore.kernel.org/bpf/ZT+2qCc%2FaXep0%2FLf@krava/ Signed-off-by: Dave Marchevsky Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Cc: Jiri Olsa Acked-by: Jiri Olsa Acked-by: David Vernet Acked-by: Yafang Shao Link: https://lore.kernel.org/r/20231031215625.2343848-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 6 ++---- include/linux/btf.h | 9 +++++++++ kernel/bpf/bpf_iter.c | 6 ++---- kernel/bpf/cgroup_iter.c | 6 ++---- kernel/bpf/cpumask.c | 6 ++---- kernel/bpf/helpers.c | 6 ++---- kernel/bpf/map_iter.c | 6 ++---- kernel/bpf/task_iter.c | 18 ++++++------------ kernel/trace/bpf_trace.c | 6 ++---- net/bpf/test_run.c | 7 +++---- net/core/filter.c | 13 ++++--------- net/core/xdp.c | 6 ++---- net/ipv4/fou_bpf.c | 6 ++---- net/netfilter/nf_conntrack_bpf.c | 6 ++---- net/netfilter/nf_nat_bpf.c | 6 ++---- net/xfrm/xfrm_interface_bpf.c | 6 ++---- 16 files changed, 46 insertions(+), 73 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 0d2647fb358d..723408e399ab 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -37,16 +37,14 @@ prototype in a header for the wrapper kfunc. An example is given below:: /* Disables missing prototype warnings */ - __diag_push(); - __diag_ignore_all("-Wmissing-prototypes", - "Global kfuncs as their definitions will be in BTF"); + __bpf_kfunc_start_defs(); __bpf_kfunc struct task_struct *bpf_find_get_task_by_vpid(pid_t nr) { return find_get_task_by_vpid(nr); } - __diag_pop(); + __bpf_kfunc_end_defs(); A wrapper kfunc is often needed when we need to annotate parameters of the kfunc. Otherwise one may directly make the kfunc visible to the BPF program by diff --git a/include/linux/btf.h b/include/linux/btf.h index c2231c64d60b..dc5ce962f600 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -84,6 +84,15 @@ */ #define __bpf_kfunc __used noinline +#define __bpf_kfunc_start_defs() \ + __diag_push(); \ + __diag_ignore_all("-Wmissing-declarations", \ + "Global kfuncs as their definitions will be in BTF");\ + __diag_ignore_all("-Wmissing-prototypes", \ + "Global kfuncs as their definitions will be in BTF") + +#define __bpf_kfunc_end_defs() __diag_pop() + /* * Return the name of the passed struct, if exists, or halt the build if for * example the structure gets renamed. In this way, developers have to revisit diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 833faa04461b..0fae79164187 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -782,9 +782,7 @@ struct bpf_iter_num_kern { int end; /* final value, exclusive */ } __aligned(8); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) { @@ -843,4 +841,4 @@ __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) s->cur = s->end = 0; } -__diag_pop(); +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 209e5135f9fb..d1b5c5618dd7 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -305,9 +305,7 @@ struct bpf_iter_css_kern { unsigned int flags; } __attribute__((aligned(8))); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, struct cgroup_subsys_state *start, unsigned int flags) @@ -358,4 +356,4 @@ __bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it) { } -__diag_pop(); \ No newline at end of file +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 6983af8e093c..e01c741e54e7 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -34,9 +34,7 @@ static bool cpu_valid(u32 cpu) return cpu < nr_cpu_ids; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global kfuncs as their definitions will be in BTF"); +__bpf_kfunc_start_defs(); /** * bpf_cpumask_create() - Create a mutable BPF cpumask. @@ -407,7 +405,7 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, return cpumask_any_and_distribute(src1, src2); } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 95449ea7cc1b..abe82105e33e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1886,9 +1886,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, } } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { @@ -2505,7 +2503,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) WARN(1, "A call to BPF exception callback should never return\n"); } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6fc9dae9edc8..6abd7c5df4b3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -193,9 +193,7 @@ static int __init bpf_map_iter_init(void) late_initcall(bpf_map_iter_init); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) { @@ -213,7 +211,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) return ret; } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(bpf_map_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8967d1ac7551..4e156dca48de 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -822,9 +822,7 @@ struct bpf_iter_task_vma_kern { struct bpf_iter_task_vma_kern_data *data; } __attribute__((aligned(8))); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, struct task_struct *task, u64 addr) @@ -890,7 +888,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) } } -__diag_pop(); +__bpf_kfunc_end_defs(); #ifdef CONFIG_CGROUPS @@ -902,9 +900,7 @@ struct bpf_iter_css_task_kern { struct css_task_iter *css_it; } __attribute__((aligned(8))); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it, struct cgroup_subsys_state *css, unsigned int flags) @@ -950,7 +946,7 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) bpf_mem_free(&bpf_global_ma, kit->css_it); } -__diag_pop(); +__bpf_kfunc_end_defs(); #endif /* CONFIG_CGROUPS */ @@ -973,9 +969,7 @@ enum { BPF_TASK_ITER_PROC_THREADS }; -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task__nullable, unsigned int flags) @@ -1045,7 +1039,7 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it) { } -__diag_pop(); +__bpf_kfunc_end_defs(); DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index df697c74d519..84e8a0f6e4e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1252,9 +1252,7 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { }; #ifdef CONFIG_KEYS -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "kfuncs which will be used in BPF programs"); +__bpf_kfunc_start_defs(); /** * bpf_lookup_user_key - lookup a key by its serial @@ -1404,7 +1402,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, } #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(key_sig_kfunc_set) BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0841f8d82419..c9fdcc5cdce1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -503,9 +503,8 @@ out: * architecture dependent calling conventions. 7+ can be supported in the * future. */ -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); + __bpf_kfunc int bpf_fentry_test1(int a) { return a + 1; @@ -605,7 +604,7 @@ __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) diff --git a/net/core/filter.c b/net/core/filter.c index 21d75108c2e9..383f96b0a1c7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11767,9 +11767,7 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) return func; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr__uninit) { @@ -11816,7 +11814,7 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, return 0; } -__diag_pop(); +__bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr__uninit) @@ -11879,10 +11877,7 @@ static int __init bpf_kfunc_init(void) } late_initcall(bpf_kfunc_init); -/* Disables missing prototype warnings */ -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * @@ -11916,7 +11911,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } -__diag_pop() +__bpf_kfunc_end_defs(); BTF_SET8_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) diff --git a/net/core/xdp.c b/net/core/xdp.c index df4789ab512d..b6f1d6dab3f2 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -696,9 +696,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) return nxdpf; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); /** * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. @@ -738,7 +736,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, return -EOPNOTSUPP; } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c index 3760a14b6b57..4da03bf45c9b 100644 --- a/net/ipv4/fou_bpf.c +++ b/net/ipv4/fou_bpf.c @@ -22,9 +22,7 @@ enum bpf_fou_encap_type { FOU_BPF_ENCAP_GUE, }; -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in BTF"); +__bpf_kfunc_start_defs(); /* bpf_skb_set_fou_encap - Set FOU encap parameters * @@ -100,7 +98,7 @@ __bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, return 0; } -__diag_pop() +__bpf_kfunc_end_defs(); BTF_SET8_START(fou_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_set_fou_encap) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index b21799d468d2..475358ec8212 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -230,9 +230,7 @@ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, return 0; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in nf_conntrack BTF"); +__bpf_kfunc_start_defs(); /* bpf_xdp_ct_alloc - Allocate a new CT entry * @@ -467,7 +465,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) return nf_ct_change_status_common(nfct, status); } -__diag_pop() +__bpf_kfunc_end_defs(); BTF_SET8_START(nf_ct_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 141ee7783223..6e3b2f58855f 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -12,9 +12,7 @@ #include #include -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in nf_nat BTF"); +__bpf_kfunc_start_defs(); /* bpf_ct_set_nat_info - Set source or destination nat address * @@ -54,7 +52,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } -__diag_pop() +__bpf_kfunc_end_defs(); BTF_SET8_START(nf_nat_kfunc_set) BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c index d74f3fd20f2b..7d5e920141e9 100644 --- a/net/xfrm/xfrm_interface_bpf.c +++ b/net/xfrm/xfrm_interface_bpf.c @@ -27,9 +27,7 @@ struct bpf_xfrm_info { int link; }; -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in xfrm_interface BTF"); +__bpf_kfunc_start_defs(); /* bpf_skb_get_xfrm_info - Get XFRM metadata * @@ -93,7 +91,7 @@ __bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bp return 0; } -__diag_pop() +__bpf_kfunc_end_defs(); BTF_SET8_START(xfrm_ifc_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) -- cgit From 15fb6f2b6c4c3c129adc2412ae12ec15e60a6adb Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 31 Oct 2023 14:56:25 -0700 Subject: bpf: Add __bpf_hook_{start,end} macros Not all uses of __diag_ignore_all(...) in BPF-related code in order to suppress warnings are wrapping kfunc definitions. Some "hook point" definitions - small functions meant to be used as attach points for fentry and similar BPF progs - need to suppress -Wmissing-declarations. We could use __bpf_kfunc_{start,end}_defs added in the previous patch in such cases, but this might be confusing to someone unfamiliar with BPF internals. Instead, this patch adds __bpf_hook_{start,end} macros, currently having the same effect as __bpf_kfunc_{start,end}_defs, then uses them to suppress warnings for two hook points in the kernel itself and some bpf_testmod hook points as well. Signed-off-by: Dave Marchevsky Cc: Yafang Shao Acked-by: Jiri Olsa Acked-by: Yafang Shao Link: https://lore.kernel.org/r/20231031215625.2343848-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 ++ kernel/cgroup/rstat.c | 9 +++------ net/socket.c | 8 ++------ tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 6 ++---- 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index dc5ce962f600..59d404e22814 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -92,6 +92,8 @@ "Global kfuncs as their definitions will be in BTF") #define __bpf_kfunc_end_defs() __diag_pop() +#define __bpf_hook_start() __bpf_kfunc_start_defs() +#define __bpf_hook_end() __bpf_kfunc_end_defs() /* * Return the name of the passed struct, if exists, or halt the build if for diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d80d7a608141..c0adb7254b45 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -156,19 +156,16 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * optimize away the callsite. Therefore, __weak is needed to ensure that the * call is still emitted, by telling the compiler that we don't know what the * function might eventually be. - * - * __diag_* below are needed to dismiss the missing prototype warning. */ -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "kfuncs which will be used in BPF programs"); + +__bpf_hook_start(); __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, struct cgroup *parent, int cpu) { } -__diag_pop(); +__bpf_hook_end(); /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) diff --git a/net/socket.c b/net/socket.c index 0d1c4e78fc7f..3379c64217a4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1685,20 +1685,16 @@ struct file *__sys_socket_file(int family, int type, int protocol) * Therefore, __weak is needed to ensure that the call is still * emitted, by telling the compiler that we don't know what the * function might eventually be. - * - * __diag_* below are needed to dismiss the missing prototype warning. */ -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "A fmod_ret entry point for BPF programs"); +__bpf_hook_start(); __weak noinline int update_socket_protocol(int family, int type, int protocol) { return protocol; } -__diag_pop(); +__bpf_hook_end(); int __sys_socket(int family, int type, int protocol) { diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index a5e246f7b202..91907b321f91 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -39,9 +39,7 @@ struct bpf_testmod_struct_arg_4 { int b; }; -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in bpf_testmod.ko BTF"); +__bpf_hook_start(); noinline int bpf_testmod_test_struct_arg_1(struct bpf_testmod_struct_arg_2 a, int b, int c) { @@ -335,7 +333,7 @@ noinline int bpf_fentry_shadow_test(int a) } EXPORT_SYMBOL_GPL(bpf_fentry_shadow_test); -__diag_pop(); +__bpf_hook_end(); static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { .attr = { .name = "bpf_testmod", .mode = 0666, }, -- cgit From fd381ce60a2d79cc967506208085336d3d268ae0 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 30 Oct 2023 14:36:16 +0800 Subject: bpf: Check map->usercnt after timer->timer is assigned When there are concurrent uref release and bpf timer init operations, the following sequence diagram is possible. It will break the guarantee provided by bpf_timer: bpf_timer will still be alive after userspace application releases or unpins the map. It also will lead to kmemleak for old kernel version which doesn't release bpf_timer when map is released. bpf program X: bpf_timer_init() lock timer->lock read timer->timer as NULL read map->usercnt != 0 process Y: close(map_fd) // put last uref bpf_map_put_uref() atomic_dec_and_test(map->usercnt) array_map_free_timers() bpf_timer_cancel_and_free() // just return read timer->timer is NULL t = bpf_map_kmalloc_node() timer->timer = t unlock timer->lock Fix the problem by checking map->usercnt after timer->timer is assigned, so when there are concurrent uref release and bpf timer init, either bpf_timer_cancel_and_free() from uref release reads a no-NULL timer or the newly-added atomic64_read() returns a zero usercnt. Because atomic_dec_and_test(map->usercnt) and READ_ONCE(timer->timer) in bpf_timer_cancel_and_free() are not protected by a lock, so add a memory barrier to guarantee the order between map->usercnt and timer->timer. Also use WRITE_ONCE(timer->timer, x) to match the lockless read of timer->timer in bpf_timer_cancel_and_free(). Reported-by: Hsin-Wei Hung Closes: https://lore.kernel.org/bpf/CABcoxUaT2k9hWsS1tNgXyoU3E-=PuOgMn737qK984fbFmfYixQ@mail.gmail.com Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231030063616.1653024-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index abe82105e33e..56b0c1f678ee 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1177,13 +1177,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -EBUSY; goto out; } - if (!atomic64_read(&map->usercnt)) { - /* maps with timers must be either held by user space - * or pinned in bpffs. - */ - ret = -EPERM; - goto out; - } /* allocate hrtimer via map_kmalloc to use memcg accounting */ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); if (!t) { @@ -1196,7 +1189,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map rcu_assign_pointer(t->callback_fn, NULL); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; - timer->timer = t; + WRITE_ONCE(timer->timer, t); + /* Guarantee the order between timer->timer and map->usercnt. So + * when there are concurrent uref release and bpf timer init, either + * bpf_timer_cancel_and_free() called by uref release reads a no-NULL + * timer or atomic64_read() below returns a zero usercnt. + */ + smp_mb(); + if (!atomic64_read(&map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. + */ + WRITE_ONCE(timer->timer, NULL); + kfree(t); + ret = -EPERM; + } out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; @@ -1374,7 +1381,7 @@ void bpf_timer_cancel_and_free(void *val) /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ - timer->timer = NULL; + WRITE_ONCE(timer->timer, NULL); out: __bpf_spin_unlock_irqrestore(&timer->lock); if (!t) -- cgit From 9af3775962afa8b5cd0cc30c1e454405a650c1f3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 28 Oct 2023 18:15:09 -0700 Subject: selftests/bpf: fix test_maps' use of bpf_map_create_opts Use LIBBPF_OPTS() macro to properly initialize bpf_map_create_opts in test_maps' tests. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231029011509.2479232-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/map_tests/map_percpu_stats.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c index 1a9eeefda9a8..8bf497a9843e 100644 --- a/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c +++ b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c @@ -326,20 +326,14 @@ static int map_create(__u32 type, const char *name, struct bpf_map_create_opts * static int create_hash(void) { - struct bpf_map_create_opts map_opts = { - .sz = sizeof(map_opts), - .map_flags = BPF_F_NO_PREALLOC, - }; + LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC); return map_create(BPF_MAP_TYPE_HASH, "hash", &map_opts); } static int create_percpu_hash(void) { - struct bpf_map_create_opts map_opts = { - .sz = sizeof(map_opts), - .map_flags = BPF_F_NO_PREALLOC, - }; + LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC); return map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_hash", &map_opts); } @@ -356,21 +350,17 @@ static int create_percpu_hash_prealloc(void) static int create_lru_hash(__u32 type, __u32 map_flags) { - struct bpf_map_create_opts map_opts = { - .sz = sizeof(map_opts), - .map_flags = map_flags, - }; + LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = map_flags); return map_create(type, "lru_hash", &map_opts); } static int create_hash_of_maps(void) { - struct bpf_map_create_opts map_opts = { - .sz = sizeof(map_opts), + LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC, .inner_map_fd = create_small_hash(), - }; + ); int ret; ret = map_create_opts(BPF_MAP_TYPE_HASH_OF_MAPS, "hash_of_maps", -- cgit From 3091b667498b0a212e760e1033e5f9b8c33a948f Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Tue, 31 Oct 2023 13:04:36 +0800 Subject: bpf: Relax allowlist for css_task iter The newly added open-coded css_task iter would try to hold the global css_set_lock in bpf_iter_css_task_new, so the bpf side has to be careful in where it allows to use this iter. The mainly concern is dead locking on css_set_lock. check_css_task_iter_allowlist() in verifier enforced css_task can only be used in bpf_lsm hooks and sleepable bpf_iter. This patch relax the allowlist for css_task iter. Any lsm and any iter (even non-sleepable) and any sleepable are safe since they would not hold the css_set_lock before entering BPF progs context. This patch also fixes the misused BPF_TRACE_ITER in check_css_task_iter_allowlist which compared bpf_prog_type with bpf_attach_type. Fixes: 9c66dc94b62ae ("bpf: Introduce css_task open-coded iterator kfuncs") Signed-off-by: Chuyi Zhou Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231031050438.93297-2-zhouchuyi@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++++++---- tools/testing/selftests/bpf/progs/iters_task_failure.c | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 014b4d1ef408..def99b1a2b17 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11402,6 +11402,12 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, &meta->arg_rbtree_root.field); } +/* + * css_task iter allowlist is needed to avoid dead locking on css_set_lock. + * LSM hooks and iters (both sleepable and non-sleepable) are safe. + * Any sleepable progs are also safe since bpf_check_attach_target() enforce + * them can only be attached to some specific hook points. + */ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); @@ -11409,10 +11415,12 @@ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env) switch (prog_type) { case BPF_PROG_TYPE_LSM: return true; - case BPF_TRACE_ITER: - return env->prog->aux->sleepable; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type == BPF_TRACE_ITER) + return true; + fallthrough; default: - return false; + return env->prog->aux->sleepable; } } @@ -11671,7 +11679,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_ITER: if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) { if (!check_css_task_iter_allowlist(env)) { - verbose(env, "css_task_iter is only allowed in bpf_lsm and bpf iter-s\n"); + verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n"); return -EINVAL; } } diff --git a/tools/testing/selftests/bpf/progs/iters_task_failure.c b/tools/testing/selftests/bpf/progs/iters_task_failure.c index c3bf96a67dba..6b1588d70652 100644 --- a/tools/testing/selftests/bpf/progs/iters_task_failure.c +++ b/tools/testing/selftests/bpf/progs/iters_task_failure.c @@ -84,8 +84,8 @@ int BPF_PROG(iter_css_lock_and_unlock) return 0; } -SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -__failure __msg("css_task_iter is only allowed in bpf_lsm and bpf iter-s") +SEC("?fentry/" SYS_PREFIX "sys_getpgid") +__failure __msg("css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs") int BPF_PROG(iter_css_task_for_each) { u64 cg_id = bpf_get_current_cgroup_id(); -- cgit From f49843afde6771ef6ed5d021eacafacfc98a58bf Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Tue, 31 Oct 2023 13:04:37 +0800 Subject: selftests/bpf: Add tests for css_task iter combining with cgroup iter This patch adds a test which demonstrates how css_task iter can be combined with cgroup iter and it won't cause deadlock, though cgroup iter is not sleepable. Signed-off-by: Chuyi Zhou Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231031050438.93297-3-zhouchuyi@bytedance.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/cgroup_iter.c | 33 ++++++++++++++++ tools/testing/selftests/bpf/progs/iters_css_task.c | 44 ++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index e02feb5fae97..574d9a0cdc8e 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -4,6 +4,7 @@ #include #include #include +#include "iters_css_task.skel.h" #include "cgroup_iter.skel.h" #include "cgroup_helpers.h" @@ -263,6 +264,35 @@ close_cgrp: close(cgrp_fd); } +static void test_walk_self_only_css_task(void) +{ + struct iters_css_task *skel; + int err; + + skel = iters_css_task__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + bpf_program__set_autoload(skel->progs.cgroup_id_printer, true); + + err = iters_css_task__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = join_cgroup(cg_path[CHILD2]); + if (!ASSERT_OK(err, "join_cgroup")) + goto cleanup; + + skel->bss->target_pid = getpid(); + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n" EPILOGUE, cg_id[CHILD2]); + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[CHILD2], + BPF_CGROUP_ITER_SELF_ONLY, "test_walk_self_only_css_task"); + ASSERT_EQ(skel->bss->css_task_cnt, 1, "css_task_cnt"); +cleanup: + iters_css_task__destroy(skel); +} + void test_cgroup_iter(void) { struct cgroup_iter *skel = NULL; @@ -293,6 +323,9 @@ void test_cgroup_iter(void) test_walk_self_only(skel); if (test__start_subtest("cgroup_iter__dead_self_only")) test_walk_dead_self_only(skel); + if (test__start_subtest("cgroup_iter__self_only_css_task")) + test_walk_self_only_css_task(); + out: cgroup_iter__destroy(skel); cleanup_cgroups(); diff --git a/tools/testing/selftests/bpf/progs/iters_css_task.c b/tools/testing/selftests/bpf/progs/iters_css_task.c index 5089ce384a1c..384ff806990f 100644 --- a/tools/testing/selftests/bpf/progs/iters_css_task.c +++ b/tools/testing/selftests/bpf/progs/iters_css_task.c @@ -10,6 +10,7 @@ char _license[] SEC("license") = "GPL"; +struct cgroup *bpf_cgroup_acquire(struct cgroup *p) __ksym; struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; void bpf_cgroup_release(struct cgroup *p) __ksym; @@ -45,3 +46,46 @@ int BPF_PROG(iter_css_task_for_each, struct vm_area_struct *vma, return -EPERM; } + +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + +SEC("?iter/cgroup") +int cgroup_id_printer(struct bpf_iter__cgroup *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct cgroup *cgrp, *acquired; + struct cgroup_subsys_state *css; + struct task_struct *task; + u64 cgrp_id; + + cgrp = ctx->cgroup; + + /* epilogue */ + if (cgrp == NULL) { + BPF_SEQ_PRINTF(seq, "epilogue\n"); + return 0; + } + + /* prologue */ + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, "prologue\n"); + + cgrp_id = cgroup_id(cgrp); + + BPF_SEQ_PRINTF(seq, "%8llu\n", cgrp_id); + + acquired = bpf_cgroup_from_id(cgrp_id); + if (!acquired) + return 0; + css = &acquired->self; + css_task_cnt = 0; + bpf_for_each(css_task, task, css, CSS_TASK_ITER_PROCS) { + if (task->pid == target_pid) + css_task_cnt++; + } + bpf_cgroup_release(acquired); + return 0; +} -- cgit From d8234d47c4aa494d789b85562fa90e837b4575f9 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Tue, 31 Oct 2023 13:04:38 +0800 Subject: selftests/bpf: Add test for using css_task iter in sleepable progs This Patch add a test to prove css_task iter can be used in normal sleepable progs. Signed-off-by: Chuyi Zhou Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231031050438.93297-4-zhouchuyi@bytedance.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/iters.c | 1 + tools/testing/selftests/bpf/progs/iters_css_task.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index c2425791c923..bf84d4a1d9ae 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -294,6 +294,7 @@ void test_iters(void) RUN_TESTS(iters_state_safety); RUN_TESTS(iters_looping); RUN_TESTS(iters); + RUN_TESTS(iters_css_task); if (env.has_testmod) RUN_TESTS(iters_testmod_seq); diff --git a/tools/testing/selftests/bpf/progs/iters_css_task.c b/tools/testing/selftests/bpf/progs/iters_css_task.c index 384ff806990f..e180aa1b1d52 100644 --- a/tools/testing/selftests/bpf/progs/iters_css_task.c +++ b/tools/testing/selftests/bpf/progs/iters_css_task.c @@ -89,3 +89,22 @@ int cgroup_id_printer(struct bpf_iter__cgroup *ctx) bpf_cgroup_release(acquired); return 0; } + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int BPF_PROG(iter_css_task_for_each_sleep) +{ + u64 cgrp_id = bpf_get_current_cgroup_id(); + struct cgroup *cgrp = bpf_cgroup_from_id(cgrp_id); + struct cgroup_subsys_state *css; + struct task_struct *task; + + if (cgrp == NULL) + return 0; + css = &cgrp->self; + + bpf_for_each(css_task, task, css, CSS_TASK_ITER_PROCS) { + + } + bpf_cgroup_release(cgrp); + return 0; +} -- cgit From 291d044fd51f8484066300ee42afecf8c8db7b3a Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Thu, 2 Nov 2023 13:39:03 +0800 Subject: bpf: Fix precision tracking for BPF_ALU | BPF_TO_BE | BPF_END MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF_END and BPF_NEG has a different specification for the source bit in the opcode compared to other ALU/ALU64 instructions, and is either reserved or use to specify the byte swap endianness. In both cases the source bit does not encode source operand location, and src_reg is a reserved field. backtrack_insn() currently does not differentiate BPF_END and BPF_NEG from other ALU/ALU64 instructions, which leads to r0 being incorrectly marked as precise when processing BPF_ALU | BPF_TO_BE | BPF_END instructions. This commit teaches backtrack_insn() to correctly mark precision for such case. While precise tracking of BPF_NEG and other BPF_END instructions are correct and does not need fixing, this commit opt to process all BPF_NEG and BPF_END instructions within the same if-clause to better align with current convention used in the verifier (e.g. check_alu_op). Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Cc: stable@vger.kernel.org Reported-by: Mohamed Mahmoud Closes: https://lore.kernel.org/r/87jzrrwptf.fsf@toke.dk Tested-by: Toke Høiland-Jørgensen Tested-by: Tao Lyu Acked-by: Eduard Zingerman Signed-off-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102053913.12004-2-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index def99b1a2b17..bd1c42eb540f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3742,7 +3742,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; - if (opcode == BPF_MOV) { + if (opcode == BPF_END || opcode == BPF_NEG) { + /* sreg is reserved and unused + * dreg still need precision before this insn + */ + return 0; + } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { /* dreg = sreg or dreg = (s8, s16, s32)sreg * dreg needs precision after this insn -- cgit From 3c41971550f58f2e006c58aa71e8c23ad312110f Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Thu, 2 Nov 2023 13:39:05 +0800 Subject: selftests/bpf: precision tracking test for BPF_NEG and BPF_END As seen from previous commit that fix backtracking for BPF_ALU | BPF_TO_BE | BPF_END, both BPF_NEG and BPF_END require special handling. Add tests written with inline assembly to check that the verifier does not incorrecly use the src_reg field of BPF_NEG and BPF_END (including bswap added in v4). Suggested-by: Eduard Zingerman Signed-off-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102053913.12004-4-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_precision.c | 93 ++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_precision.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index e3e68c97b40c..e5c61aa6604a 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -46,6 +46,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" +#include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" #include "verifier_raw_stack.skel.h" #include "verifier_raw_tp_writable.skel.h" @@ -153,6 +154,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } +void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } void test_verifier_raw_stack(void) { RUN(verifier_raw_stack); } void test_verifier_raw_tp_writable(void) { RUN(verifier_raw_tp_writable); } diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c new file mode 100644 index 000000000000..193c0f8272d0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2023 SUSE LLC */ +#include +#include +#include "bpf_misc.h" + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r1 = r10") +__msg("mark_precise: frame0: regs=r2 stack= before 2: (55) if r2 != 0xfffffff8 goto pc+2") +__msg("mark_precise: frame0: regs=r2 stack= before 1: (87) r2 = -r2") +__msg("mark_precise: frame0: regs=r2 stack= before 0: (b7) r2 = 8") +__naked int bpf_neg(void) +{ + asm volatile ( + "r2 = 8;" + "r2 = -r2;" + "if r2 != -8 goto 1f;" + "r1 = r10;" + "r1 += r2;" + "1:" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r1 = r10") +__msg("mark_precise: frame0: regs=r2 stack= before 2: (55) if r2 != 0x0 goto pc+2") +__msg("mark_precise: frame0: regs=r2 stack= before 1: (d4) r2 = le16 r2") +__msg("mark_precise: frame0: regs=r2 stack= before 0: (b7) r2 = 0") +__naked int bpf_end_to_le(void) +{ + asm volatile ( + "r2 = 0;" + "r2 = le16 r2;" + "if r2 != 0 goto 1f;" + "r1 = r10;" + "r1 += r2;" + "1:" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r1 = r10") +__msg("mark_precise: frame0: regs=r2 stack= before 2: (55) if r2 != 0x0 goto pc+2") +__msg("mark_precise: frame0: regs=r2 stack= before 1: (dc) r2 = be16 r2") +__msg("mark_precise: frame0: regs=r2 stack= before 0: (b7) r2 = 0") +__naked int bpf_end_to_be(void) +{ + asm volatile ( + "r2 = 0;" + "r2 = be16 r2;" + "if r2 != 0 goto 1f;" + "r1 = r10;" + "r1 += r2;" + "1:" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ + defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390)) && \ + __clang_major__ >= 18 + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r1 = r10") +__msg("mark_precise: frame0: regs=r2 stack= before 2: (55) if r2 != 0x0 goto pc+2") +__msg("mark_precise: frame0: regs=r2 stack= before 1: (d7) r2 = bswap16 r2") +__msg("mark_precise: frame0: regs=r2 stack= before 0: (b7) r2 = 0") +__naked int bpf_end_bswap(void) +{ + asm volatile ( + "r2 = 0;" + "r2 = bswap16 r2;" + "if r2 != 0 goto 1f;" + "r1 = r10;" + "r1 += r2;" + "1:" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +#endif /* v4 instruction */ -- cgit From 1726483b79a72e0150734d5367e4a0238bf8fcff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Oct 2023 14:10:37 +0000 Subject: inet: shrink struct flowi_common I am looking at syzbot reports triggering kernel stack overflows involving a cascade of ipvlan devices. We can save 8 bytes in struct flowi_common. This patch alone will not fix the issue, but is a start. Fixes: 24ba14406c5c ("route: Add multipath_hash in flowi_common to make user-define hash") Signed-off-by: Eric Dumazet Cc: wenxu Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231025141037.3448203-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/flow.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/flow.h b/include/net/flow.h index 7f0adda3bf2f..335bbc52171c 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -40,8 +40,8 @@ struct flowi_common { #define FLOWI_FLAG_KNOWN_NH 0x02 __u32 flowic_secid; kuid_t flowic_uid; - struct flowi_tunnel flowic_tun_key; __u32 flowic_multipath_hash; + struct flowi_tunnel flowic_tun_key; }; union flowi_uli { -- cgit From 96b9a68d1a6e4f889d453874c9e359aa720b520f Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Fri, 27 Oct 2023 07:49:52 +0530 Subject: octeontx2-pf: Fix error codes Some of error codes were wrong. Fix the same. Fixes: 51afe9026d0c ("octeontx2-pf: NIX TX overwrites SQ_CTX_HW_S[SQ_INT]") Signed-off-by: Ratheesh Kannoth Reviewed-by: Wojciech Drewek Link: https://lore.kernel.org/r/20231027021953.1819959-1-rkannoth@marvell.com Signed-off-by: Paolo Abeni --- .../ethernet/marvell/octeontx2/nic/otx2_struct.h | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h index fa37b9f312ca..4e5899d8fa2e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h @@ -318,23 +318,23 @@ enum nix_snd_status_e { NIX_SND_STATUS_EXT_ERR = 0x6, NIX_SND_STATUS_JUMP_FAULT = 0x7, NIX_SND_STATUS_JUMP_POISON = 0x8, - NIX_SND_STATUS_CRC_ERR = 0x9, - NIX_SND_STATUS_IMM_ERR = 0x10, - NIX_SND_STATUS_SG_ERR = 0x11, - NIX_SND_STATUS_MEM_ERR = 0x12, - NIX_SND_STATUS_INVALID_SUBDC = 0x13, - NIX_SND_STATUS_SUBDC_ORDER_ERR = 0x14, - NIX_SND_STATUS_DATA_FAULT = 0x15, - NIX_SND_STATUS_DATA_POISON = 0x16, - NIX_SND_STATUS_NPC_DROP_ACTION = 0x17, - NIX_SND_STATUS_LOCK_VIOL = 0x18, - NIX_SND_STATUS_NPC_UCAST_CHAN_ERR = 0x19, - NIX_SND_STATUS_NPC_MCAST_CHAN_ERR = 0x20, - NIX_SND_STATUS_NPC_MCAST_ABORT = 0x21, - NIX_SND_STATUS_NPC_VTAG_PTR_ERR = 0x22, - NIX_SND_STATUS_NPC_VTAG_SIZE_ERR = 0x23, - NIX_SND_STATUS_SEND_MEM_FAULT = 0x24, - NIX_SND_STATUS_SEND_STATS_ERR = 0x25, + NIX_SND_STATUS_CRC_ERR = 0x10, + NIX_SND_STATUS_IMM_ERR = 0x11, + NIX_SND_STATUS_SG_ERR = 0x12, + NIX_SND_STATUS_MEM_ERR = 0x13, + NIX_SND_STATUS_INVALID_SUBDC = 0x14, + NIX_SND_STATUS_SUBDC_ORDER_ERR = 0x15, + NIX_SND_STATUS_DATA_FAULT = 0x16, + NIX_SND_STATUS_DATA_POISON = 0x17, + NIX_SND_STATUS_NPC_DROP_ACTION = 0x20, + NIX_SND_STATUS_LOCK_VIOL = 0x21, + NIX_SND_STATUS_NPC_UCAST_CHAN_ERR = 0x22, + NIX_SND_STATUS_NPC_MCAST_CHAN_ERR = 0x23, + NIX_SND_STATUS_NPC_MCAST_ABORT = 0x24, + NIX_SND_STATUS_NPC_VTAG_PTR_ERR = 0x25, + NIX_SND_STATUS_NPC_VTAG_SIZE_ERR = 0x26, + NIX_SND_STATUS_SEND_MEM_FAULT = 0x27, + NIX_SND_STATUS_SEND_STATS_ERR = 0x28, NIX_SND_STATUS_MAX, }; -- cgit From 7aeeb2cb7a2570bb69a87ad14018b03e06ce5be5 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Fri, 27 Oct 2023 07:49:53 +0530 Subject: octeontx2-pf: Fix holes in error code Error code strings are not getting printed properly due to holes. Print error code as well. Fixes: 51afe9026d0c ("octeontx2-pf: NIX TX overwrites SQ_CTX_HW_S[SQ_INT]") Signed-off-by: Ratheesh Kannoth Reviewed-by: Wojciech Drewek Link: https://lore.kernel.org/r/20231027021953.1819959-2-rkannoth@marvell.com Signed-off-by: Paolo Abeni --- .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 80 +++++++++++++--------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 6daf4d58c25d..125fe231702a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1193,31 +1193,32 @@ static char *nix_mnqerr_e_str[NIX_MNQERR_MAX] = { }; static char *nix_snd_status_e_str[NIX_SND_STATUS_MAX] = { - "NIX_SND_STATUS_GOOD", - "NIX_SND_STATUS_SQ_CTX_FAULT", - "NIX_SND_STATUS_SQ_CTX_POISON", - "NIX_SND_STATUS_SQB_FAULT", - "NIX_SND_STATUS_SQB_POISON", - "NIX_SND_STATUS_HDR_ERR", - "NIX_SND_STATUS_EXT_ERR", - "NIX_SND_STATUS_JUMP_FAULT", - "NIX_SND_STATUS_JUMP_POISON", - "NIX_SND_STATUS_CRC_ERR", - "NIX_SND_STATUS_IMM_ERR", - "NIX_SND_STATUS_SG_ERR", - "NIX_SND_STATUS_MEM_ERR", - "NIX_SND_STATUS_INVALID_SUBDC", - "NIX_SND_STATUS_SUBDC_ORDER_ERR", - "NIX_SND_STATUS_DATA_FAULT", - "NIX_SND_STATUS_DATA_POISON", - "NIX_SND_STATUS_NPC_DROP_ACTION", - "NIX_SND_STATUS_LOCK_VIOL", - "NIX_SND_STATUS_NPC_UCAST_CHAN_ERR", - "NIX_SND_STATUS_NPC_MCAST_CHAN_ERR", - "NIX_SND_STATUS_NPC_MCAST_ABORT", - "NIX_SND_STATUS_NPC_VTAG_PTR_ERR", - "NIX_SND_STATUS_NPC_VTAG_SIZE_ERR", - "NIX_SND_STATUS_SEND_STATS_ERR", + [NIX_SND_STATUS_GOOD] = "NIX_SND_STATUS_GOOD", + [NIX_SND_STATUS_SQ_CTX_FAULT] = "NIX_SND_STATUS_SQ_CTX_FAULT", + [NIX_SND_STATUS_SQ_CTX_POISON] = "NIX_SND_STATUS_SQ_CTX_POISON", + [NIX_SND_STATUS_SQB_FAULT] = "NIX_SND_STATUS_SQB_FAULT", + [NIX_SND_STATUS_SQB_POISON] = "NIX_SND_STATUS_SQB_POISON", + [NIX_SND_STATUS_HDR_ERR] = "NIX_SND_STATUS_HDR_ERR", + [NIX_SND_STATUS_EXT_ERR] = "NIX_SND_STATUS_EXT_ERR", + [NIX_SND_STATUS_JUMP_FAULT] = "NIX_SND_STATUS_JUMP_FAULT", + [NIX_SND_STATUS_JUMP_POISON] = "NIX_SND_STATUS_JUMP_POISON", + [NIX_SND_STATUS_CRC_ERR] = "NIX_SND_STATUS_CRC_ERR", + [NIX_SND_STATUS_IMM_ERR] = "NIX_SND_STATUS_IMM_ERR", + [NIX_SND_STATUS_SG_ERR] = "NIX_SND_STATUS_SG_ERR", + [NIX_SND_STATUS_MEM_ERR] = "NIX_SND_STATUS_MEM_ERR", + [NIX_SND_STATUS_INVALID_SUBDC] = "NIX_SND_STATUS_INVALID_SUBDC", + [NIX_SND_STATUS_SUBDC_ORDER_ERR] = "NIX_SND_STATUS_SUBDC_ORDER_ERR", + [NIX_SND_STATUS_DATA_FAULT] = "NIX_SND_STATUS_DATA_FAULT", + [NIX_SND_STATUS_DATA_POISON] = "NIX_SND_STATUS_DATA_POISON", + [NIX_SND_STATUS_NPC_DROP_ACTION] = "NIX_SND_STATUS_NPC_DROP_ACTION", + [NIX_SND_STATUS_LOCK_VIOL] = "NIX_SND_STATUS_LOCK_VIOL", + [NIX_SND_STATUS_NPC_UCAST_CHAN_ERR] = "NIX_SND_STAT_NPC_UCAST_CHAN_ERR", + [NIX_SND_STATUS_NPC_MCAST_CHAN_ERR] = "NIX_SND_STAT_NPC_MCAST_CHAN_ERR", + [NIX_SND_STATUS_NPC_MCAST_ABORT] = "NIX_SND_STATUS_NPC_MCAST_ABORT", + [NIX_SND_STATUS_NPC_VTAG_PTR_ERR] = "NIX_SND_STATUS_NPC_VTAG_PTR_ERR", + [NIX_SND_STATUS_NPC_VTAG_SIZE_ERR] = "NIX_SND_STATUS_NPC_VTAG_SIZE_ERR", + [NIX_SND_STATUS_SEND_MEM_FAULT] = "NIX_SND_STATUS_SEND_MEM_FAULT", + [NIX_SND_STATUS_SEND_STATS_ERR] = "NIX_SND_STATUS_SEND_STATS_ERR", }; static irqreturn_t otx2_q_intr_handler(int irq, void *data) @@ -1238,14 +1239,16 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) continue; if (val & BIT_ULL(42)) { - netdev_err(pf->netdev, "CQ%lld: error reading NIX_LF_CQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", + netdev_err(pf->netdev, + "CQ%lld: error reading NIX_LF_CQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", qidx, otx2_read64(pf, NIX_LF_ERR_INT)); } else { if (val & BIT_ULL(NIX_CQERRINT_DOOR_ERR)) netdev_err(pf->netdev, "CQ%lld: Doorbell error", qidx); if (val & BIT_ULL(NIX_CQERRINT_CQE_FAULT)) - netdev_err(pf->netdev, "CQ%lld: Memory fault on CQE write to LLC/DRAM", + netdev_err(pf->netdev, + "CQ%lld: Memory fault on CQE write to LLC/DRAM", qidx); } @@ -1272,7 +1275,8 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) (val & NIX_SQINT_BITS)); if (val & BIT_ULL(42)) { - netdev_err(pf->netdev, "SQ%lld: error reading NIX_LF_SQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", + netdev_err(pf->netdev, + "SQ%lld: error reading NIX_LF_SQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", qidx, otx2_read64(pf, NIX_LF_ERR_INT)); goto done; } @@ -1282,8 +1286,11 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) goto chk_mnq_err_dbg; sq_op_err_code = FIELD_GET(GENMASK(7, 0), sq_op_err_dbg); - netdev_err(pf->netdev, "SQ%lld: NIX_LF_SQ_OP_ERR_DBG(%llx) err=%s\n", - qidx, sq_op_err_dbg, nix_sqoperr_e_str[sq_op_err_code]); + netdev_err(pf->netdev, + "SQ%lld: NIX_LF_SQ_OP_ERR_DBG(0x%llx) err=%s(%#x)\n", + qidx, sq_op_err_dbg, + nix_sqoperr_e_str[sq_op_err_code], + sq_op_err_code); otx2_write64(pf, NIX_LF_SQ_OP_ERR_DBG, BIT_ULL(44)); @@ -1300,16 +1307,21 @@ chk_mnq_err_dbg: goto chk_snd_err_dbg; mnq_err_code = FIELD_GET(GENMASK(7, 0), mnq_err_dbg); - netdev_err(pf->netdev, "SQ%lld: NIX_LF_MNQ_ERR_DBG(%llx) err=%s\n", - qidx, mnq_err_dbg, nix_mnqerr_e_str[mnq_err_code]); + netdev_err(pf->netdev, + "SQ%lld: NIX_LF_MNQ_ERR_DBG(0x%llx) err=%s(%#x)\n", + qidx, mnq_err_dbg, nix_mnqerr_e_str[mnq_err_code], + mnq_err_code); otx2_write64(pf, NIX_LF_MNQ_ERR_DBG, BIT_ULL(44)); chk_snd_err_dbg: snd_err_dbg = otx2_read64(pf, NIX_LF_SEND_ERR_DBG); if (snd_err_dbg & BIT(44)) { snd_err_code = FIELD_GET(GENMASK(7, 0), snd_err_dbg); - netdev_err(pf->netdev, "SQ%lld: NIX_LF_SND_ERR_DBG:0x%llx err=%s\n", - qidx, snd_err_dbg, nix_snd_status_e_str[snd_err_code]); + netdev_err(pf->netdev, + "SQ%lld: NIX_LF_SND_ERR_DBG:0x%llx err=%s(%#x)\n", + qidx, snd_err_dbg, + nix_snd_status_e_str[snd_err_code], + snd_err_code); otx2_write64(pf, NIX_LF_SEND_ERR_DBG, BIT_ULL(44)); } -- cgit From 5a22fbcc10f3f7d94c5d88afbbffa240a3677057 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Fri, 27 Oct 2023 08:57:38 +0200 Subject: net: dsa: lan9303: consequently nested-lock physical MDIO When LAN9303 is MDIO-connected two callchains exist into mdio->bus->write(): 1. switch ports 1&2 ("physical" PHYs): virtual (switch-internal) MDIO bus (lan9303_switch_ops->phy_{read|write})-> lan9303_mdio_phy_{read|write} -> mdiobus_{read|write}_nested 2. LAN9303 virtual PHY: virtual MDIO bus (lan9303_phy_{read|write}) -> lan9303_virt_phy_reg_{read|write} -> regmap -> lan9303_mdio_{read|write} If the latter functions just take mutex_lock(&sw_dev->device->bus->mdio_lock) it triggers a LOCKDEP false-positive splat. It's false-positive because the first mdio_lock in the second callchain above belongs to virtual MDIO bus, the second mdio_lock belongs to physical MDIO bus. Consequent annotation in lan9303_mdio_{read|write} as nested lock (similar to lan9303_mdio_phy_{read|write}, it's the same physical MDIO bus) prevents the following splat: WARNING: possible circular locking dependency detected 5.15.71 #1 Not tainted ------------------------------------------------------ kworker/u4:3/609 is trying to acquire lock: ffff000011531c68 (lan9303_mdio:131:(&lan9303_mdio_regmap_config)->lock){+.+.}-{3:3}, at: regmap_lock_mutex but task is already holding lock: ffff0000114c44d8 (&bus->mdio_lock){+.+.}-{3:3}, at: mdiobus_read which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&bus->mdio_lock){+.+.}-{3:3}: lock_acquire __mutex_lock mutex_lock_nested lan9303_mdio_read _regmap_read regmap_read lan9303_probe lan9303_mdio_probe mdio_probe really_probe __driver_probe_device driver_probe_device __device_attach_driver bus_for_each_drv __device_attach device_initial_probe bus_probe_device deferred_probe_work_func process_one_work worker_thread kthread ret_from_fork -> #0 (lan9303_mdio:131:(&lan9303_mdio_regmap_config)->lock){+.+.}-{3:3}: __lock_acquire lock_acquire.part.0 lock_acquire __mutex_lock mutex_lock_nested regmap_lock_mutex regmap_read lan9303_phy_read dsa_slave_phy_read __mdiobus_read mdiobus_read get_phy_device mdiobus_scan __mdiobus_register dsa_register_switch lan9303_probe lan9303_mdio_probe mdio_probe really_probe __driver_probe_device driver_probe_device __device_attach_driver bus_for_each_drv __device_attach device_initial_probe bus_probe_device deferred_probe_work_func process_one_work worker_thread kthread ret_from_fork other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&bus->mdio_lock); lock(lan9303_mdio:131:(&lan9303_mdio_regmap_config)->lock); lock(&bus->mdio_lock); lock(lan9303_mdio:131:(&lan9303_mdio_regmap_config)->lock); *** DEADLOCK *** 5 locks held by kworker/u4:3/609: #0: ffff000002842938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work #1: ffff80000bacbd60 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work #2: ffff000007645178 (&dev->mutex){....}-{3:3}, at: __device_attach #3: ffff8000096e6e78 (dsa2_mutex){+.+.}-{3:3}, at: dsa_register_switch #4: ffff0000114c44d8 (&bus->mdio_lock){+.+.}-{3:3}, at: mdiobus_read stack backtrace: CPU: 1 PID: 609 Comm: kworker/u4:3 Not tainted 5.15.71 #1 Workqueue: events_unbound deferred_probe_work_func Call trace: dump_backtrace show_stack dump_stack_lvl dump_stack print_circular_bug check_noncircular __lock_acquire lock_acquire.part.0 lock_acquire __mutex_lock mutex_lock_nested regmap_lock_mutex regmap_read lan9303_phy_read dsa_slave_phy_read __mdiobus_read mdiobus_read get_phy_device mdiobus_scan __mdiobus_register dsa_register_switch lan9303_probe lan9303_mdio_probe ... Cc: stable@vger.kernel.org Fixes: dc7005831523 ("net: dsa: LAN9303: add MDIO managed mode support") Signed-off-by: Alexander Sverdlin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20231027065741.534971-1-alexander.sverdlin@siemens.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/lan9303_mdio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/lan9303_mdio.c b/drivers/net/dsa/lan9303_mdio.c index d8ab2b77d201..167a86f39f27 100644 --- a/drivers/net/dsa/lan9303_mdio.c +++ b/drivers/net/dsa/lan9303_mdio.c @@ -32,7 +32,7 @@ static int lan9303_mdio_write(void *ctx, uint32_t reg, uint32_t val) struct lan9303_mdio *sw_dev = (struct lan9303_mdio *)ctx; reg <<= 2; /* reg num to offset */ - mutex_lock(&sw_dev->device->bus->mdio_lock); + mutex_lock_nested(&sw_dev->device->bus->mdio_lock, MDIO_MUTEX_NESTED); lan9303_mdio_real_write(sw_dev->device, reg, val & 0xffff); lan9303_mdio_real_write(sw_dev->device, reg + 2, (val >> 16) & 0xffff); mutex_unlock(&sw_dev->device->bus->mdio_lock); @@ -50,7 +50,7 @@ static int lan9303_mdio_read(void *ctx, uint32_t reg, uint32_t *val) struct lan9303_mdio *sw_dev = (struct lan9303_mdio *)ctx; reg <<= 2; /* reg num to offset */ - mutex_lock(&sw_dev->device->bus->mdio_lock); + mutex_lock_nested(&sw_dev->device->bus->mdio_lock, MDIO_MUTEX_NESTED); *val = lan9303_mdio_real_read(sw_dev->device, reg); *val |= (lan9303_mdio_real_read(sw_dev->device, reg + 2) << 16); mutex_unlock(&sw_dev->device->bus->mdio_lock); -- cgit From f55d8e60f10909dbc5524e261041e1d28d7d20d8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 28 Oct 2023 21:25:11 +0200 Subject: net: ethtool: Fix documentation of ethtool_sprintf() This function takes a pointer to a pointer, unlike sprintf() which is passed a plain pointer. Fix up the documentation to make this clear. Fixes: 7888fe53b706 ("ethtool: Add common function for filling out strings") Cc: Alexander Duyck Cc: Justin Stitt Cc: stable@vger.kernel.org Signed-off-by: Andrew Lunn Reviewed-by: Justin Stitt Link: https://lore.kernel.org/r/20231028192511.100001-1-andrew@lunn.ch Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 226a36ed5aa1..689028257fcc 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1045,10 +1045,10 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, /** * ethtool_sprintf - Write formatted string to ethtool string data - * @data: Pointer to start of string to update + * @data: Pointer to a pointer to the start of string to update * @fmt: Format of string to write * - * Write formatted string to data. Update data to point at start of + * Write formatted string to *data. Update *data to point at start of * next string. */ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); -- cgit From 8ffbd1669ed1d58939d6e878dffaa2f60bf961a4 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 30 Oct 2023 17:12:56 +0800 Subject: net: page_pool: add missing free_percpu when page_pool_init fail When ptr_ring_init() returns failure in page_pool_init(), free_percpu() is not called to free pool->recycle_stats, which may cause memory leak. Fixes: ad6fa1e1ab1b ("page_pool: Add recycle stats") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Yunsheng Lin Reviewed-by: Jiri Pirko Reviewed-by: Somnath Kotur Reviewed-by: Ilias Apalodimas Link: https://lore.kernel.org/r/20231030091256.2915394-1-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- net/core/page_pool.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5e409b98aba0..dec544337236 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -217,8 +217,12 @@ static int page_pool_init(struct page_pool *pool, return -ENOMEM; #endif - if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { +#ifdef CONFIG_PAGE_POOL_STATS + free_percpu(pool->recycle_stats); +#endif return -ENOMEM; + } atomic_set(&pool->pages_state_release_cnt, 0); -- cgit From a1602d749097386ec9e8e411a16a9c37ff6cd5fc Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Mon, 30 Oct 2023 18:03:43 +0100 Subject: net/smc: fix documentation of buffer sizes Since commit 833bac7ec392 ("net/smc: Fix setsockopt and sysctl to specify same buffer size again") the SMC protocol uses its own default values for the smc.rmem and smc.wmem sysctl variables which are no longer derived from the TCP IPv4 buffer sizes. Fixup the kernel documentation to reflect this change, too. Fixes: 833bac7ec392 ("net/smc: Fix setsockopt and sysctl to specify same buffer size again") Signed-off-by: Gerd Bayer Reviewed-by: Wenjia Zhang Reviewed-by: Dust Li Link: https://lore.kernel.org/r/20231030170343.748097-1-gbayer@linux.ibm.com Signed-off-by: Paolo Abeni --- Documentation/networking/smc-sysctl.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index 6d8acdbe9be1..769149d98773 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -44,18 +44,16 @@ smcr_testlink_time - INTEGER wmem - INTEGER Initial size of send buffer used by SMC sockets. - The default value inherits from net.ipv4.tcp_wmem[1]. The minimum value is 16KiB and there is no hard limit for max value, but only allowed 512KiB for SMC-R and 1MiB for SMC-D. - Default: 16K + Default: 64KiB rmem - INTEGER Initial size of receive buffer (RMB) used by SMC sockets. - The default value inherits from net.ipv4.tcp_rmem[1]. The minimum value is 16KiB and there is no hard limit for max value, but only allowed 512KiB for SMC-R and 1MiB for SMC-D. - Default: 128K + Default: 64KiB -- cgit From fa2df45af13091f76b89adb84a28f13818d5d631 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 30 Oct 2023 13:10:41 -0700 Subject: dccp: Call security_inet_conn_request() after setting IPv4 addresses. Initially, commit 4237c75c0a35 ("[MLSXFRM]: Auto-labeling of child sockets") introduced security_inet_conn_request() in some functions where reqsk is allocated. The hook is added just after the allocation, so reqsk's IPv4 remote address was not initialised then. However, SELinux/Smack started to read it in netlbl_req_setattr() after the cited commits. This bug was partially fixed by commit 284904aa7946 ("lsm: Relocate the IPv4 security_inet_conn_request() hooks"). This patch fixes the last bug in DCCPv4. Fixes: 389fb800ac8b ("netlabel: Label incoming TCP connections correctly in SELinux") Fixes: 07feee8f812f ("netlabel: Cleanup the Smack/NetLabel code to fix incoming TCP connections") Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore Signed-off-by: Paolo Abeni --- net/dccp/ipv4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 1b8cbfda6e5d..44b033fe1ef6 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -629,9 +629,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_parse_options(sk, dreq, skb)) goto drop_and_free; - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - ireq = inet_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -639,6 +636,9 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ireq_family = AF_INET; ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + /* * Step 3: Process LISTEN state * -- cgit From 23be1e0e2a83a8543214d2599a31d9a2185a796b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 30 Oct 2023 13:10:42 -0700 Subject: dccp/tcp: Call security_inet_conn_request() after setting IPv6 addresses. Initially, commit 4237c75c0a35 ("[MLSXFRM]: Auto-labeling of child sockets") introduced security_inet_conn_request() in some functions where reqsk is allocated. The hook is added just after the allocation, so reqsk's IPv6 remote address was not initialised then. However, SELinux/Smack started to read it in netlbl_req_setattr() after commit e1adea927080 ("calipso: Allow request sockets to be relabelled by the lsm."). Commit 284904aa7946 ("lsm: Relocate the IPv4 security_inet_conn_request() hooks") fixed that kind of issue only in TCPv4 because IPv6 labeling was not supported at that time. Finally, the same issue was introduced again in IPv6. Let's apply the same fix on DCCPv6 and TCPv6. Fixes: e1adea927080 ("calipso: Allow request sockets to be relabelled by the lsm.") Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore Signed-off-by: Paolo Abeni --- net/dccp/ipv6.c | 6 +++--- net/ipv6/syncookies.c | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 8d344b219f84..4550b680665a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -360,15 +360,15 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_parse_options(sk, dreq, skb)) goto drop_and_free; - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ireq_family = AF_INET6; ireq->ir_mark = inet_request_mark(sk, skb); + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 500f6ed3b8cf..12eedc6ca2cc 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -181,14 +181,15 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->tfo_listener = false; - if (security_inet_conn_request(sk, skb, req)) - goto out_free; - req->mss = mss; ireq->ir_rmt_port = th->source; ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + + if (security_inet_conn_request(sk, skb, req)) + goto out_free; + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { -- cgit From efa5f1311c4998e9e6317c52bc5ee93b3a0f36df Mon Sep 17 00:00:00 2001 From: Patrick Thompson Date: Mon, 30 Oct 2023 16:50:14 -0400 Subject: net: r8169: Disable multicast filter for RTL8168H and RTL8107E RTL8168H and RTL8107E ethernet adapters erroneously filter unicast eapol packets unless allmulti is enabled. These devices correspond to RTL_GIGA_MAC_VER_46 and VER_48. Add an exception for VER_46 and VER_48 in the same way that VER_35 has an exception. Fixes: 6e1d0b898818 ("r8169:add support for RTL8168H and RTL8107E") Signed-off-by: Patrick Thompson Reviewed-by: Jacob Keller Reviewed-by: Heiner Kallweit Link: https://lore.kernel.org/r/20231030205031.177855-1-ptf@google.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/realtek/r8169_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index a987defb575c..4b8251cdb436 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2584,7 +2584,9 @@ static void rtl_set_rx_mode(struct net_device *dev) rx_mode |= AcceptAllPhys; } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || dev->flags & IFF_ALLMULTI || - tp->mac_version == RTL_GIGA_MAC_VER_35) { + tp->mac_version == RTL_GIGA_MAC_VER_35 || + tp->mac_version == RTL_GIGA_MAC_VER_46 || + tp->mac_version == RTL_GIGA_MAC_VER_48) { /* accept all multicasts */ } else if (netdev_mc_empty(dev)) { rx_mode &= ~AcceptMulticast; -- cgit From d84b139f53e8fa8048f16814c6b2a53d7bc15c3d Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 2 Nov 2023 11:35:37 +0100 Subject: selftests/bpf: Fix broken build where char is unsigned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are architectures where char is not signed. If so, the following error is triggered: | xdp_hw_metadata.c:435:42: error: result of comparison of constant -1 \ | with expression of type 'char' is always true \ | [-Werror,-Wtautological-constant-out-of-range-compare] | 435 | while ((opt = getopt(argc, argv, "mh")) != -1) { | | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~ | 1 error generated. Correct by changing the char to int. Fixes: bb6a88885fde ("selftests/bpf: Add options and frags to xdp_hw_metadata") Signed-off-by: Björn Töpel Acked-by: Larysa Zaremba Tested-by: Anders Roxell Link: https://lore.kernel.org/r/20231102103537.247336-1-bjorn@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 17c0f92ff160..c3ba40d0b9de 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -430,7 +430,7 @@ static void print_usage(void) static void read_args(int argc, char *argv[]) { - char opt; + int opt; while ((opt = getopt(argc, argv, "mh")) != -1) { switch (opt) { -- cgit From e8ae8ad479e2d037daa33756e5e72850a7bd37a9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 24 Oct 2023 09:53:33 +1100 Subject: Fix termination state for idr_for_each_entry_ul() The comment for idr_for_each_entry_ul() states after normal termination @entry is left with the value NULL This is not correct in the case where UINT_MAX has an entry in the idr. In that case @entry will be non-NULL after termination. No current code depends on the documentation being correct, but to save future code we should fix it. Also fix idr_for_each_entry_continue_ul(). While this is not documented as leaving @entry as NULL, the mellanox driver appears to depend on it doing so. So make that explicit in the documentation as well as in the code. Fixes: e33d2b74d805 ("idr: fix overflow case for idr_for_each_entry_ul()") Cc: Matthew Wilcox Cc: Chris Mi Cc: Cong Wang Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/idr.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index a0dce14090a9..da5f5fa4a3a6 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -200,7 +200,7 @@ static inline void idr_preload_end(void) */ #define idr_for_each_entry_ul(idr, entry, tmp, id) \ for (tmp = 0, id = 0; \ - tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ + ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /** @@ -224,10 +224,12 @@ static inline void idr_preload_end(void) * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. + * After normal termination @entry is left with the value NULL. This + * is convenient for a "not found" value. */ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ for (tmp = id; \ - tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ + ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /* -- cgit From db456d90a4c1b43b6251fa4348c8adc59b583274 Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Tue, 31 Oct 2023 10:27:29 +0800 Subject: net: stmmac: xgmac: Enable support for multiple Flexible PPS outputs From XGMAC Core 3.20 and later, each Flexible PPS has individual PPSEN bit to select Fixed mode or Flexible mode. The PPSEN must be set, or it stays in Fixed PPS mode by default. XGMAC Core prior 3.20, only PPSEN0(bit 4) is writable. PPSEN{1,2,3} are read-only reserved, and they are already in Flexible mode by default, our new code always set PPSEN{1,2,3} do not make things worse ;-) Fixes: 95eaf3cd0a90 ("net: stmmac: dwxgmac: Add Flexible PPS support") Reviewed-by: Serge Semin Reviewed-by: Jacob Keller Signed-off-by: Furong Xu <0x1207@gmail.com> Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h | 2 +- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index 7a8f47e7b728..a4e8b498dea9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -259,7 +259,7 @@ ((val) << XGMAC_PPS_MINIDX(x)) #define XGMAC_PPSCMD_START 0x2 #define XGMAC_PPSCMD_STOP 0x5 -#define XGMAC_PPSEN0 BIT(4) +#define XGMAC_PPSENx(x) BIT(4 + (x) * 8) #define XGMAC_PPSx_TARGET_TIME_SEC(x) (0x00000d80 + (x) * 0x10) #define XGMAC_PPSx_TARGET_TIME_NSEC(x) (0x00000d84 + (x) * 0x10) #define XGMAC_TRGTBUSY0 BIT(31) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index f352be269deb..453e88b75be0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -1178,7 +1178,19 @@ static int dwxgmac2_flex_pps_config(void __iomem *ioaddr, int index, val |= XGMAC_PPSCMDx(index, XGMAC_PPSCMD_START); val |= XGMAC_TRGTMODSELx(index, XGMAC_PPSCMD_START); - val |= XGMAC_PPSEN0; + + /* XGMAC Core has 4 PPS outputs at most. + * + * Prior XGMAC Core 3.20, Fixed mode or Flexible mode are selectable for + * PPS0 only via PPSEN0. PPS{1,2,3} are in Flexible mode by default, + * and can not be switched to Fixed mode, since PPSEN{1,2,3} are + * read-only reserved to 0. + * But we always set PPSEN{1,2,3} do not make things worse ;-) + * + * From XGMAC Core 3.20 and later, PPSEN{0,1,2,3} are writable and must + * be set, or the PPS outputs stay in Fixed PPS mode by default. + */ + val |= XGMAC_PPSENx(index); writel(cfg->start.tv_sec, ioaddr + XGMAC_PPSx_TARGET_TIME_SEC(index)); -- cgit From 63e201916b27260218e528a2f8758be47f99bbf4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 31 Oct 2023 11:47:32 +0800 Subject: selftests: pmtu.sh: fix result checking In the PMTU test, when all previous tests are skipped and the new test passes, the exit code is set to 0. However, the current check mistakenly treats this as an assignment, causing the check to pass every time. Consequently, regardless of how many tests have failed, if the latest test passes, the PMTU test will report a pass. Fixes: 2a9d3716b810 ("selftests: pmtu.sh: improve the test result processing") Signed-off-by: Hangbin Liu Acked-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index f838dd370f6a..b3b2dc5a630c 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -2048,7 +2048,7 @@ run_test() { case $ret in 0) all_skipped=false - [ $exitcode=$ksft_skip ] && exitcode=0 + [ $exitcode -eq $ksft_skip ] && exitcode=0 ;; $ksft_skip) [ $all_skipped = true ] && exitcode=$ksft_skip -- cgit From cdbab6236605dc11780779d9af689aea7d58cab1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Oct 2023 06:19:45 +0000 Subject: tcp: fix fastopen code vs usec TS After blamed commit, TFO client-ack-dropped-then-recovery-ms-timestamps packetdrill test failed. David Morley and Neal Cardwell started investigating and Neal pointed that we had : tcp_conn_request() tcp_try_fastopen() -> tcp_fastopen_create_child -> child = inet_csk(sk)->icsk_af_ops->syn_recv_sock() -> tcp_create_openreq_child() -> copy req_usec_ts from req: newtp->tcp_usec_ts = treq->req_usec_ts; // now the new TFO server socket always does usec TS, no matter // what the route options are... send_synack() -> tcp_make_synack() // disable tcp_rsk(req)->req_usec_ts if route option is not present: if (tcp_rsk(req)->req_usec_ts < 0) tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); tcp_conn_request() has the initial dst, we can initialize tcp_rsk(req)->req_usec_ts there instead of later in send_synack(); This means tcp_rsk(req)->req_usec_ts can be a boolean. Many thanks to David an Neal for their help. Fixes: 614e8316aa4c ("tcp: add support for usec resolution in TCP TS values") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202310302216.f79d78bc-oliver.sang@intel.com Suggested-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: David Morley Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_input.c | 7 ++++--- net/ipv4/tcp_output.c | 2 -- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ec4e9367f5b0..68f3d315d2e1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -152,7 +152,7 @@ struct tcp_request_sock { u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; - s8 req_usec_ts; + bool req_usec_ts; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 98b25e5d147b..d37282c06e3d 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -306,7 +306,7 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, treq->af_specific = af_ops; treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; - treq->req_usec_ts = -1; + treq->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 50aaa1527150..bcb55d98004c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7115,7 +7115,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; - tcp_rsk(req)->req_usec_ts = -1; + tcp_rsk(req)->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) tcp_rsk(req)->is_mptcp = 0; #endif @@ -7143,9 +7143,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!dst) goto drop_and_free; - if (tmp_opt.tstamp_ok) + if (tmp_opt.tstamp_ok) { + tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - + } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f558c054cf6e..0d8dd5b7e2e5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3693,8 +3693,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); - if (tcp_rsk(req)->req_usec_ts < 0) - tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) -- cgit From 153a58c6d8976f289b52e6652932c1cb28a2eacd Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Tue, 31 Oct 2023 13:20:05 +0100 Subject: net: ethernet: ti: am65-cpsw: rx_pause/tx_pause controls wrong direction The rx_pause flag says that whether we support receiving Pause frames. When a Pause frame is received TX is delayed for some time. This is TX flow control. In the same manner tx_pause is actually RX flow control. Signed-off-by: Ronald Wahl Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 24120605502f..ece9f8df98ae 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1588,10 +1588,10 @@ static void am65_cpsw_nuss_mac_link_up(struct phylink_config *config, struct phy /* rx_pause/tx_pause */ if (rx_pause) - mac_control |= CPSW_SL_CTL_RX_FLOW_EN; + mac_control |= CPSW_SL_CTL_TX_FLOW_EN; if (tx_pause) - mac_control |= CPSW_SL_CTL_TX_FLOW_EN; + mac_control |= CPSW_SL_CTL_RX_FLOW_EN; cpsw_sl_ctl_set(port->slave.mac_sl, mac_control); -- cgit From 02f0717e9835dbdeee26084e42086fbd32e64eb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Nov 2023 04:52:33 +0000 Subject: net/tcp: fix possible out-of-bounds reads in tcp_hash_fail() syzbot managed to trigger a fault by sending TCP packets with all flags being set. v2: - While fixing this bug, add PSH flag handling and represent flags the way tcpdump does : [S], [S.], [P.] - Print 4-tuples more consistently between families. BUG: KASAN: stack-out-of-bounds in string_nocheck lib/vsprintf.c:645 [inline] BUG: KASAN: stack-out-of-bounds in string+0x394/0x3d0 lib/vsprintf.c:727 Read of size 1 at addr ffffc9000397f3f5 by task syz-executor299/5039 CPU: 1 PID: 5039 Comm: syz-executor299 Not tainted 6.6.0-rc7-syzkaller-02075-g55c900477f5b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 string_nocheck lib/vsprintf.c:645 [inline] string+0x394/0x3d0 lib/vsprintf.c:727 vsnprintf+0xc5f/0x1870 lib/vsprintf.c:2818 vprintk_store+0x3a0/0xb80 kernel/printk/printk.c:2191 vprintk_emit+0x14c/0x5f0 kernel/printk/printk.c:2288 vprintk+0x7b/0x90 kernel/printk/printk_safe.c:45 _printk+0xc8/0x100 kernel/printk/printk.c:2332 tcp_inbound_hash.constprop.0+0xdb2/0x10d0 include/net/tcp.h:2760 tcp_v6_rcv+0x2b31/0x34d0 net/ipv6/tcp_ipv6.c:1882 ip6_protocol_deliver_rcu+0x33b/0x13d0 net/ipv6/ip6_input.c:438 ip6_input_finish+0x14f/0x2f0 net/ipv6/ip6_input.c:483 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip6_input+0xce/0x440 net/ipv6/ip6_input.c:492 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:79 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ipv6_rcv+0x563/0x720 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core+0x115/0x180 net/core/dev.c:5527 __netif_receive_skb+0x1f/0x1b0 net/core/dev.c:5641 netif_receive_skb_internal net/core/dev.c:5727 [inline] netif_receive_skb+0x133/0x700 net/core/dev.c:5786 tun_rx_batched+0x429/0x780 drivers/net/tun.c:1579 tun_get_user+0x29e7/0x3bc0 drivers/net/tun.c:2002 tun_chr_write_iter+0xe8/0x210 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:1956 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x650/0xe40 fs/read_write.c:584 ksys_write+0x12f/0x250 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 2717b5adea9e ("net/tcp: Add tcp_hash_fail() ratelimited logs") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Dmitry Safonov Cc: Francesco Ruggeri Cc: David Ahern Reviewed-by: Dmitry Safonov Signed-off-by: David S. Miller --- include/net/tcp_ao.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index a375a171ef3c..b56be10838f0 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -124,7 +124,7 @@ struct tcp_ao_info { #define tcp_hash_fail(msg, family, skb, fmt, ...) \ do { \ const struct tcphdr *th = tcp_hdr(skb); \ - char hdr_flags[5] = {}; \ + char hdr_flags[6]; \ char *f = hdr_flags; \ \ if (th->fin) \ @@ -133,17 +133,18 @@ do { \ *f++ = 'S'; \ if (th->rst) \ *f++ = 'R'; \ + if (th->psh) \ + *f++ = 'P'; \ if (th->ack) \ - *f++ = 'A'; \ - if (f != hdr_flags) \ - *f = ' '; \ + *f++ = '.'; \ + *f = 0; \ if ((family) == AF_INET) { \ - net_info_ratelimited("%s for (%pI4, %d)->(%pI4, %d) %s" fmt "\n", \ + net_info_ratelimited("%s for %pI4.%d->%pI4.%d [%s] " fmt "\n", \ msg, &ip_hdr(skb)->saddr, ntohs(th->source), \ &ip_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } else { \ - net_info_ratelimited("%s for [%pI6c]:%u->[%pI6c]:%u %s" fmt "\n", \ + net_info_ratelimited("%s for [%pI6c].%d->[%pI6c].%d [%s]" fmt "\n", \ msg, &ipv6_hdr(skb)->saddr, ntohs(th->source), \ &ipv6_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ -- cgit From 016b9332a3346e97a6cacffea0f9dc10e1235a75 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Nov 2023 21:57:24 -0700 Subject: netlink: fill in missing MODULE_DESCRIPTION() W=1 builds now warn if a module is built without a MODULE_DESCRIPTION(). Fill it in for sock_diag. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/netlink/diag.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 9c4f231be275..1eeff9422856 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -257,5 +257,6 @@ static void __exit netlink_diag_exit(void) module_init(netlink_diag_init); module_exit(netlink_diag_exit); +MODULE_DESCRIPTION("Netlink-based socket monitoring/diagnostic interface (sock_diag)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */); -- cgit From 40cb2fdfed342e7e578d551a073687789f698d89 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 28 Oct 2023 13:16:10 -0400 Subject: net, sched: Fix SKB_NOT_DROPPED_YET splat under debug config Getting the following splat [1] with CONFIG_DEBUG_NET=y and this reproducer [2]. Problem seems to be that classifiers clear 'struct tcf_result::drop_reason', thereby triggering the warning in __kfree_skb_reason() due to reason being 'SKB_NOT_DROPPED_YET' (0). Fixed by disambiguating a legit error from a verdict with a bogus drop_reason [1] WARNING: CPU: 0 PID: 181 at net/core/skbuff.c:1082 kfree_skb_reason+0x38/0x130 Modules linked in: CPU: 0 PID: 181 Comm: mausezahn Not tainted 6.6.0-rc6-custom-ge43e6d9582e0 #682 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc37 04/01/2014 RIP: 0010:kfree_skb_reason+0x38/0x130 [...] Call Trace: __netif_receive_skb_core.constprop.0+0x837/0xdb0 __netif_receive_skb_one_core+0x3c/0x70 process_backlog+0x95/0x130 __napi_poll+0x25/0x1b0 net_rx_action+0x29b/0x310 __do_softirq+0xc0/0x29b do_softirq+0x43/0x60 [2] ip link add name veth0 type veth peer name veth1 ip link set dev veth0 up ip link set dev veth1 up tc qdisc add dev veth1 clsact tc filter add dev veth1 ingress pref 1 proto all flower dst_mac 00:11:22:33:44:55 action drop mausezahn veth0 -a own -b 00:11:22:33:44:55 -q -c 1 Ido reported: [...] getting the following splat [1] with CONFIG_DEBUG_NET=y and this reproducer [2]. Problem seems to be that classifiers clear 'struct tcf_result::drop_reason', thereby triggering the warning in __kfree_skb_reason() due to reason being 'SKB_NOT_DROPPED_YET' (0). [...] [1] WARNING: CPU: 0 PID: 181 at net/core/skbuff.c:1082 kfree_skb_reason+0x38/0x130 Modules linked in: CPU: 0 PID: 181 Comm: mausezahn Not tainted 6.6.0-rc6-custom-ge43e6d9582e0 #682 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc37 04/01/2014 RIP: 0010:kfree_skb_reason+0x38/0x130 [...] Call Trace: __netif_receive_skb_core.constprop.0+0x837/0xdb0 __netif_receive_skb_one_core+0x3c/0x70 process_backlog+0x95/0x130 __napi_poll+0x25/0x1b0 net_rx_action+0x29b/0x310 __do_softirq+0xc0/0x29b do_softirq+0x43/0x60 [2] #!/bin/bash ip link add name veth0 type veth peer name veth1 ip link set dev veth0 up ip link set dev veth1 up tc qdisc add dev veth1 clsact tc filter add dev veth1 ingress pref 1 proto all flower dst_mac 00:11:22:33:44:55 action drop mausezahn veth0 -a own -b 00:11:22:33:44:55 -q -c 1 What happens is that inside most classifiers the tcf_result is copied over from a filter template e.g. *res = f->res which then implicitly overrides the prior SKB_DROP_REASON_TC_{INGRESS,EGRESS} default drop code which was set via sch_handle_{ingress,egress}() for kfree_skb_reason(). Commit text above copied verbatim from Daniel. The general idea of the patch is not very different from what Ido originally posted but instead done at the cls_api codepath. Fixes: 54a59aed395c ("net, sched: Make tc-related drop reason more flexible") Reported-by: Ido Schimmel Signed-off-by: Jamal Hadi Salim Link: https://lore.kernel.org/netdev/ZTjY959R+AFXf3Xy@shredder Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 +- net/sched/cls_api.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9d3f26bf0440..c39252d61ebb 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1098,7 +1098,7 @@ repeat: } } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { if (unlikely(!rcu_access_pointer(a->goto_chain))) { - net_warn_ratelimited("can't go to NULL chain!\n"); + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; } tcf_action_goto_chain_exec(a, res); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1daeb2182b70..1976bd163986 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1658,6 +1658,7 @@ static inline int __tcf_classify(struct sk_buff *skb, int act_index, u32 *last_executed_chain) { + u32 orig_reason = res->drop_reason; #ifdef CONFIG_NET_CLS_ACT const int max_reclassify_loop = 16; const struct tcf_proto *first_tp; @@ -1712,8 +1713,14 @@ reclassify: goto reset; } #endif - if (err >= 0) + if (err >= 0) { + /* Policy drop or drop reason is over-written by + * classifiers with a bogus value(0) */ + if (err == TC_ACT_SHOT && + res->drop_reason == SKB_NOT_DROPPED_YET) + tcf_set_drop_reason(res, orig_reason); return err; + } } if (unlikely(n)) { -- cgit From 3423ca23e08bf285a324237abe88e7e7d9becfe6 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Tue, 31 Oct 2023 16:53:45 +0530 Subject: octeontx2-pf: Free pending and dropped SQEs On interface down, the pending SQEs in the NIX get dropped or drained out during SMQ flush. But skb's pointed by these SQEs never get free or updated to the stack as respective CQE never get added. This patch fixes the issue by freeing all valid skb's in SQ SG list. Fixes: b1bc8457e9d0 ("octeontx2-pf: Cleanup all receive buffers in SG descriptor") Signed-off-by: Geetha sowjanya Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/otx2_common.c | 15 +++----- .../ethernet/marvell/octeontx2/nic/otx2_common.h | 1 + .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 1 + .../net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 42 ++++++++++++++++++++++ 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 1a42bfded872..7ca6941ea0b9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -818,7 +818,6 @@ void otx2_sqb_flush(struct otx2_nic *pfvf) int qidx, sqe_tail, sqe_head; struct otx2_snd_queue *sq; u64 incr, *ptr, val; - int timeout = 1000; ptr = (u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS); for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) { @@ -827,15 +826,11 @@ void otx2_sqb_flush(struct otx2_nic *pfvf) continue; incr = (u64)qidx << 32; - while (timeout) { - val = otx2_atomic64_add(incr, ptr); - sqe_head = (val >> 20) & 0x3F; - sqe_tail = (val >> 28) & 0x3F; - if (sqe_head == sqe_tail) - break; - usleep_range(1, 3); - timeout--; - } + val = otx2_atomic64_add(incr, ptr); + sqe_head = (val >> 20) & 0x3F; + sqe_tail = (val >> 28) & 0x3F; + if (sqe_head != sqe_tail) + usleep_range(50, 60); } } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index c04a8ee53a82..e7c69b57147e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -977,6 +977,7 @@ int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool pfc_en); int otx2_txsch_alloc(struct otx2_nic *pfvf); void otx2_txschq_stop(struct otx2_nic *pfvf); void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq); +void otx2_free_pending_sqe(struct otx2_nic *pfvf); void otx2_sqb_flush(struct otx2_nic *pfvf); int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, dma_addr_t *dma); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 125fe231702a..91b99fd70361 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1601,6 +1601,7 @@ static void otx2_free_hw_resources(struct otx2_nic *pf) else otx2_cleanup_tx_cqes(pf, cq); } + otx2_free_pending_sqe(pf); otx2_free_sq_res(pf); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 53b2a4ef5298..6ee15f3c25ed 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -1247,9 +1247,11 @@ void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, int q void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) { + int tx_pkts = 0, tx_bytes = 0; struct sk_buff *skb = NULL; struct otx2_snd_queue *sq; struct nix_cqe_tx_s *cqe; + struct netdev_queue *txq; int processed_cqe = 0; struct sg_list *sg; int qidx; @@ -1270,12 +1272,20 @@ void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) sg = &sq->sg[cqe->comp.sqe_id]; skb = (struct sk_buff *)sg->skb; if (skb) { + tx_bytes += skb->len; + tx_pkts++; otx2_dma_unmap_skb_frags(pfvf, sg); dev_kfree_skb_any(skb); sg->skb = (u64)NULL; } } + if (likely(tx_pkts)) { + if (qidx >= pfvf->hw.tx_queues) + qidx -= pfvf->hw.xdp_queues; + txq = netdev_get_tx_queue(pfvf->netdev, qidx); + netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); + } /* Free CQEs to HW */ otx2_write64(pfvf, NIX_LF_CQ_OP_DOOR, ((u64)cq->cq_idx << 32) | processed_cqe); @@ -1302,6 +1312,38 @@ int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable) return err; } +void otx2_free_pending_sqe(struct otx2_nic *pfvf) +{ + int tx_pkts = 0, tx_bytes = 0; + struct sk_buff *skb = NULL; + struct otx2_snd_queue *sq; + struct netdev_queue *txq; + struct sg_list *sg; + int sq_idx, sqe; + + for (sq_idx = 0; sq_idx < pfvf->hw.tx_queues; sq_idx++) { + sq = &pfvf->qset.sq[sq_idx]; + for (sqe = 0; sqe < sq->sqe_cnt; sqe++) { + sg = &sq->sg[sqe]; + skb = (struct sk_buff *)sg->skb; + if (skb) { + tx_bytes += skb->len; + tx_pkts++; + otx2_dma_unmap_skb_frags(pfvf, sg); + dev_kfree_skb_any(skb); + sg->skb = (u64)NULL; + } + } + + if (!tx_pkts) + continue; + txq = netdev_get_tx_queue(pfvf->netdev, sq_idx); + netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); + tx_pkts = 0; + tx_bytes = 0; + } +} + static void otx2_xdp_sqe_add_sg(struct otx2_snd_queue *sq, u64 dma_addr, int len, int *offset) { -- cgit From 0a8e987dcc13244b5a5bc90cb1b184f813104d87 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 2 Nov 2023 14:05:48 -0700 Subject: tcp: Fix SYN option room calculation for TCP-AO. When building SYN packet in tcp_syn_options(), MSS, TS, WS, and SACKPERM are used without checking the remaining bytes in the options area. To keep that logic as is, we limit the TCP-AO MAC length in tcp_ao_parse_crypto(). Currently, the limit is calculated as below. MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED - TCPOLEN_WSCALE_ALIGNED - TCPOLEN_SACKPERM_ALIGNED This looks confusing as (1) we pack SACKPERM into the leading 2-bytes of the aligned 12-bytes of TS and (2) TCPOLEN_MSS_ALIGNED is not used. Fortunately, the calculated limit is not wrong as TCPOLEN_SACKPERM_ALIGNED and TCPOLEN_MSS_ALIGNED are the same value. However, we should use the proper constant in the formula. MAX_TCP_OPTION_SPACE - TCPOLEN_MSS_ALIGNED - TCPOLEN_TSTAMP_ALIGNED - TCPOLEN_WSCALE_ALIGNED Fixes: 4954f17ddefc ("net/tcp: Introduce TCP_AO setsockopt()s") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Dmitry Safonov Signed-off-by: David S. Miller --- net/ipv4/tcp_ao.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index ef5472ed6158..7696417d0640 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1315,7 +1315,8 @@ static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key) key->maclen = cmd->maclen ?: 12; /* 12 is the default in RFC5925 */ /* Check: maclen + tcp-ao header <= (MAX_TCP_OPTION_SPACE - mss - * - tstamp - wscale - sackperm), + * - tstamp (including sackperm) + * - wscale), * see tcp_syn_options(), tcp_synack_options(), commit 33ad798c924b. * * In order to allow D-SACK with TCP-AO, the header size should be: @@ -1342,9 +1343,9 @@ static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key) * large to leave sufficient option space. */ syn_tcp_option_space = MAX_TCP_OPTION_SPACE; + syn_tcp_option_space -= TCPOLEN_MSS_ALIGNED; syn_tcp_option_space -= TCPOLEN_TSTAMP_ALIGNED; syn_tcp_option_space -= TCPOLEN_WSCALE_ALIGNED; - syn_tcp_option_space -= TCPOLEN_SACKPERM_ALIGNED; if (tcp_ao_len(key) > syn_tcp_option_space) { err = -EMSGSIZE; goto err_kfree; -- cgit From d93f9528573e1d419b69ca5ff4130201d05f6b90 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Nov 2023 11:52:27 -0700 Subject: nfsd: regenerate user space parsers after ynl-gen changes Commit 8cea95b0bd79 ("tools: ynl-gen: handle do ops with no input attrs") added support for some of the previously-skipped ops in nfsd. Regenerate the user space parsers to fill them in. Signed-off-by: Jakub Kicinski Acked-by: Chuck Lever Signed-off-by: David S. Miller --- include/uapi/linux/nfsd_netlink.h | 6 +- tools/net/ynl/generated/nfsd-user.c | 120 ++++++++++++++++++++++++++++++++++-- tools/net/ynl/generated/nfsd-user.h | 44 +++++++++++-- 3 files changed, 156 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h index c8ae72466ee6..3cd044edee5d 100644 --- a/include/uapi/linux/nfsd_netlink.h +++ b/include/uapi/linux/nfsd_netlink.h @@ -3,8 +3,8 @@ /* Documentation/netlink/specs/nfsd.yaml */ /* YNL-GEN uapi header */ -#ifndef _UAPI_LINUX_NFSD_H -#define _UAPI_LINUX_NFSD_H +#ifndef _UAPI_LINUX_NFSD_NETLINK_H +#define _UAPI_LINUX_NFSD_NETLINK_H #define NFSD_FAMILY_NAME "nfsd" #define NFSD_FAMILY_VERSION 1 @@ -36,4 +36,4 @@ enum { NFSD_CMD_MAX = (__NFSD_CMD_MAX - 1) }; -#endif /* _UAPI_LINUX_NFSD_H */ +#endif /* _UAPI_LINUX_NFSD_NETLINK_H */ diff --git a/tools/net/ynl/generated/nfsd-user.c b/tools/net/ynl/generated/nfsd-user.c index fec6828680ce..360b6448c6e9 100644 --- a/tools/net/ynl/generated/nfsd-user.c +++ b/tools/net/ynl/generated/nfsd-user.c @@ -50,9 +50,116 @@ struct ynl_policy_nest nfsd_rpc_status_nest = { /* Common nested types */ /* ============== NFSD_CMD_RPC_STATUS_GET ============== */ /* NFSD_CMD_RPC_STATUS_GET - dump */ -void nfsd_rpc_status_get_list_free(struct nfsd_rpc_status_get_list *rsp) +int nfsd_rpc_status_get_rsp_dump_parse(const struct nlmsghdr *nlh, void *data) { - struct nfsd_rpc_status_get_list *next = rsp; + struct nfsd_rpc_status_get_rsp_dump *dst; + struct ynl_parse_arg *yarg = data; + unsigned int n_compound_ops = 0; + const struct nlattr *attr; + int i; + + dst = yarg->data; + + if (dst->compound_ops) + return ynl_error_parse(yarg, "attribute already present (rpc-status.compound-ops)"); + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == NFSD_A_RPC_STATUS_XID) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.xid = 1; + dst->xid = mnl_attr_get_u32(attr); + } else if (type == NFSD_A_RPC_STATUS_FLAGS) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.flags = 1; + dst->flags = mnl_attr_get_u32(attr); + } else if (type == NFSD_A_RPC_STATUS_PROG) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.prog = 1; + dst->prog = mnl_attr_get_u32(attr); + } else if (type == NFSD_A_RPC_STATUS_VERSION) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.version = 1; + dst->version = mnl_attr_get_u8(attr); + } else if (type == NFSD_A_RPC_STATUS_PROC) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.proc = 1; + dst->proc = mnl_attr_get_u32(attr); + } else if (type == NFSD_A_RPC_STATUS_SERVICE_TIME) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.service_time = 1; + dst->service_time = mnl_attr_get_u64(attr); + } else if (type == NFSD_A_RPC_STATUS_SADDR4) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.saddr4 = 1; + dst->saddr4 = mnl_attr_get_u32(attr); + } else if (type == NFSD_A_RPC_STATUS_DADDR4) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.daddr4 = 1; + dst->daddr4 = mnl_attr_get_u32(attr); + } else if (type == NFSD_A_RPC_STATUS_SADDR6) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = mnl_attr_get_payload_len(attr); + dst->_present.saddr6_len = len; + dst->saddr6 = malloc(len); + memcpy(dst->saddr6, mnl_attr_get_payload(attr), len); + } else if (type == NFSD_A_RPC_STATUS_DADDR6) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = mnl_attr_get_payload_len(attr); + dst->_present.daddr6_len = len; + dst->daddr6 = malloc(len); + memcpy(dst->daddr6, mnl_attr_get_payload(attr), len); + } else if (type == NFSD_A_RPC_STATUS_SPORT) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sport = 1; + dst->sport = mnl_attr_get_u16(attr); + } else if (type == NFSD_A_RPC_STATUS_DPORT) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.dport = 1; + dst->dport = mnl_attr_get_u16(attr); + } else if (type == NFSD_A_RPC_STATUS_COMPOUND_OPS) { + n_compound_ops++; + } + } + + if (n_compound_ops) { + dst->compound_ops = calloc(n_compound_ops, sizeof(*dst->compound_ops)); + dst->n_compound_ops = n_compound_ops; + i = 0; + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + if (mnl_attr_get_type(attr) == NFSD_A_RPC_STATUS_COMPOUND_OPS) { + dst->compound_ops[i] = mnl_attr_get_u32(attr); + i++; + } + } + } + + return MNL_CB_OK; +} + +void +nfsd_rpc_status_get_rsp_list_free(struct nfsd_rpc_status_get_rsp_list *rsp) +{ + struct nfsd_rpc_status_get_rsp_list *next = rsp; while ((void *)next != YNL_LIST_END) { rsp = next; @@ -65,15 +172,16 @@ void nfsd_rpc_status_get_list_free(struct nfsd_rpc_status_get_list *rsp) } } -struct nfsd_rpc_status_get_list *nfsd_rpc_status_get_dump(struct ynl_sock *ys) +struct nfsd_rpc_status_get_rsp_list * +nfsd_rpc_status_get_dump(struct ynl_sock *ys) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; int err; yds.ys = ys; - yds.alloc_sz = sizeof(struct nfsd_rpc_status_get_list); - yds.cb = nfsd_rpc_status_get_rsp_parse; + yds.alloc_sz = sizeof(struct nfsd_rpc_status_get_rsp_list); + yds.cb = nfsd_rpc_status_get_rsp_dump_parse; yds.rsp_cmd = NFSD_CMD_RPC_STATUS_GET; yds.rsp_policy = &nfsd_rpc_status_nest; @@ -86,7 +194,7 @@ struct nfsd_rpc_status_get_list *nfsd_rpc_status_get_dump(struct ynl_sock *ys) return yds.first; free_list: - nfsd_rpc_status_get_list_free(yds.first); + nfsd_rpc_status_get_rsp_list_free(yds.first); return NULL; } diff --git a/tools/net/ynl/generated/nfsd-user.h b/tools/net/ynl/generated/nfsd-user.h index b6b69501031a..989c6e209ced 100644 --- a/tools/net/ynl/generated/nfsd-user.h +++ b/tools/net/ynl/generated/nfsd-user.h @@ -21,13 +21,47 @@ const char *nfsd_op_str(int op); /* Common nested types */ /* ============== NFSD_CMD_RPC_STATUS_GET ============== */ /* NFSD_CMD_RPC_STATUS_GET - dump */ -struct nfsd_rpc_status_get_list { - struct nfsd_rpc_status_get_list *next; - struct nfsd_rpc_status_get_rsp obj __attribute__ ((aligned (8))); +struct nfsd_rpc_status_get_rsp_dump { + struct { + __u32 xid:1; + __u32 flags:1; + __u32 prog:1; + __u32 version:1; + __u32 proc:1; + __u32 service_time:1; + __u32 saddr4:1; + __u32 daddr4:1; + __u32 saddr6_len; + __u32 daddr6_len; + __u32 sport:1; + __u32 dport:1; + } _present; + + __u32 xid /* big-endian */; + __u32 flags; + __u32 prog; + __u8 version; + __u32 proc; + __s64 service_time; + __u32 saddr4 /* big-endian */; + __u32 daddr4 /* big-endian */; + void *saddr6; + void *daddr6; + __u16 sport /* big-endian */; + __u16 dport /* big-endian */; + unsigned int n_compound_ops; + __u32 *compound_ops; +}; + +struct nfsd_rpc_status_get_rsp_list { + struct nfsd_rpc_status_get_rsp_list *next; + struct nfsd_rpc_status_get_rsp_dump obj __attribute__((aligned(8))); }; -void nfsd_rpc_status_get_list_free(struct nfsd_rpc_status_get_list *rsp); +void +nfsd_rpc_status_get_rsp_list_free(struct nfsd_rpc_status_get_rsp_list *rsp); -struct nfsd_rpc_status_get_list *nfsd_rpc_status_get_dump(struct ynl_sock *ys); +struct nfsd_rpc_status_get_rsp_list * +nfsd_rpc_status_get_dump(struct ynl_sock *ys); #endif /* _LINUX_NFSD_GEN_H */ -- cgit From 5211c9729484c923f8d2e06bd29f9322cc42bb8f Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 3 Nov 2023 14:07:38 +0800 Subject: net/smc: fix dangling sock under state SMC_APPFINCLOSEWAIT Considering scenario: smc_cdc_rx_handler __smc_release sock_set_flag smc_close_active() sock_set_flag __set_bit(DEAD) __set_bit(DONE) Dues to __set_bit is not atomic, the DEAD or DONE might be lost. if the DEAD flag lost, the state SMC_CLOSED will be never be reached in smc_close_passive_work: if (sock_flag(sk, SOCK_DEAD) && smc_close_sent_any_close(conn)) { sk->sk_state = SMC_CLOSED; } else { /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_APPFINCLOSEWAIT; } Replace sock_set_flags or __set_bit to set_bit will fix this problem. Since set_bit is atomic. Fixes: b38d732477e4 ("smc: socket closing and linkgroup cleanup") Signed-off-by: D. Wythe Reviewed-by: Dust Li Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- net/smc/smc.h | 5 +++++ net/smc/smc_cdc.c | 2 +- net/smc/smc_close.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index abd2667734d4..da97f946b79b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -275,7 +275,7 @@ static int __smc_release(struct smc_sock *smc) if (!smc->use_fallback) { rc = smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { if (sk->sk_state != SMC_CLOSED) { @@ -1743,7 +1743,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; - sock_set_flag(new_sk, SOCK_DEAD); + smc_sock_set_flag(new_sk, SOCK_DEAD); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; diff --git a/net/smc/smc.h b/net/smc/smc.h index 24745fde4ac2..e377980b8414 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -377,4 +377,9 @@ int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) +{ + set_bit(flag, &sk->sk_flags); +} + #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 89105e95b452..01bdb7909a14 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -385,7 +385,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc->sk.sk_shutdown |= RCV_SHUTDOWN; if (smc->clcsock && smc->clcsock->sk) smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(&smc->sk, SOCK_DONE); + smc_sock_set_flag(&smc->sk, SOCK_DONE); sock_hold(&smc->sk); /* sock_put in close_work */ if (!queue_work(smc_close_wq, &conn->close_work)) sock_put(&smc->sk); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index dbdf03e8aa5b..449ef454b53b 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -173,7 +173,7 @@ void smc_close_active_abort(struct smc_sock *smc) break; } - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); if (release_clcsock) { -- cgit From c5bf605ba4f9d6fbbb120595ab95002f4716edcb Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 3 Nov 2023 14:07:39 +0800 Subject: net/smc: allow cdc msg send rather than drop it with NULL sndbuf_desc This patch re-fix the issues mentioned by commit 22a825c541d7 ("net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler()"). Blocking sending message do solve the issues though, but it also prevents the peer to receive the final message. Besides, in logic, whether the sndbuf_desc is NULL or not have no impact on the processing of cdc message sending. Hence that, this patch allows the cdc message sending but to check the sndbuf_desc with care in smc_cdc_tx_handler(). Fixes: 22a825c541d7 ("net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler()") Signed-off-by: D. Wythe Reviewed-by: Dust Li Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 01bdb7909a14..3c06625ceb20 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -28,13 +28,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, { struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; struct smc_connection *conn = cdcpend->conn; + struct smc_buf_desc *sndbuf_desc; struct smc_sock *smc; int diff; + sndbuf_desc = conn->sndbuf_desc; smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); - if (!wc_status) { - diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, + if (!wc_status && sndbuf_desc) { + diff = smc_curs_diff(sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); /* sndbuf_space is decreased in smc_sendmsg */ @@ -114,9 +116,6 @@ int smc_cdc_msg_send(struct smc_connection *conn, union smc_host_cursor cfed; int rc; - if (unlikely(!READ_ONCE(conn->sndbuf_desc))) - return -ENOBUFS; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; -- cgit From aa96fbd6d78d9770323b21e2c92bd38821be8852 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 3 Nov 2023 14:07:40 +0800 Subject: net/smc: put sk reference if close work was canceled Note that we always hold a reference to sock when attempting to submit close_work. Therefore, if we have successfully canceled close_work from pending, we MUST release that reference to avoid potential leaks. Fixes: 42bfba9eaa33 ("net/smc: immediate termination for SMCD link groups") Signed-off-by: D. Wythe Reviewed-by: Dust Li Signed-off-by: David S. Miller --- net/smc/smc_close.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 449ef454b53b..10219f55aad1 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -116,7 +116,8 @@ static void smc_close_cancel_work(struct smc_sock *smc) struct sock *sk = &smc->sk; release_sock(sk); - cancel_work_sync(&smc->conn.close_work); + if (cancel_work_sync(&smc->conn.close_work)) + sock_put(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); } -- cgit From e96fe283c6f45dd888536ccb7b0464569533f791 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 24 Oct 2023 14:51:08 +0200 Subject: i40e: Do not call devlink_port_type_clear() Do not call devlink_port_type_clear() prior devlink port unregister and let devlink core to take care about it. Reproducer: [root@host ~]# rmmod i40e [ 4539.964699] i40e 0000:02:00.0: devlink port type for port 0 cleared without a software interface reference, device type not supported by the kernel? [ 4540.319811] i40e 0000:02:00.1: devlink port type for port 1 cleared without a software interface reference, device type not supported by the kernel? Fixes: 9e479d64dc58 ("i40e: Add initial devlink support") Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_devlink.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_devlink.c b/drivers/net/ethernet/intel/i40e/i40e_devlink.c index 74bc111b4849..cc4e9e2addb7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_devlink.c +++ b/drivers/net/ethernet/intel/i40e/i40e_devlink.c @@ -231,6 +231,5 @@ int i40e_devlink_create_port(struct i40e_pf *pf) **/ void i40e_devlink_destroy_port(struct i40e_pf *pf) { - devlink_port_type_clear(&pf->devlink_port); devlink_port_unregister(&pf->devlink_port); } -- cgit From aa54d846f3613fa9651786308c6f438e8705aff1 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 24 Oct 2023 14:51:09 +0200 Subject: i40e: Fix devlink port unregistering Ensure that devlink port is unregistered after unregistering of net device. Reproducer: [root@host ~]# rmmod i40e [ 4742.939386] i40e 0000:02:00.1: i40e_ptp_stop: removed PHC on enp2s0f1np1 [ 4743.059269] ------------[ cut here ]------------ [ 4743.063900] WARNING: CPU: 21 PID: 10766 at net/devlink/port.c:1078 devl_port_unregister+0x69/0x80 ... Fixes: 9e479d64dc58 ("i40e: Add initial devlink support") Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3157d14d9b12..f7a332e51524 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -14213,8 +14213,7 @@ int i40e_vsi_release(struct i40e_vsi *vsi) } set_bit(__I40E_VSI_RELEASING, vsi->state); uplink_seid = vsi->uplink_seid; - if (vsi->type == I40E_VSI_MAIN) - i40e_devlink_destroy_port(pf); + if (vsi->type != I40E_VSI_SRIOV) { if (vsi->netdev_registered) { vsi->netdev_registered = false; @@ -14228,6 +14227,9 @@ int i40e_vsi_release(struct i40e_vsi *vsi) i40e_vsi_disable_irq(vsi); } + if (vsi->type == I40E_VSI_MAIN) + i40e_devlink_destroy_port(pf); + spin_lock_bh(&vsi->mac_filter_hash_lock); /* clear the sync flag on all filters */ @@ -14402,14 +14404,14 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi) err_rings: i40e_vsi_free_q_vectors(vsi); - if (vsi->type == I40E_VSI_MAIN) - i40e_devlink_destroy_port(pf); if (vsi->netdev_registered) { vsi->netdev_registered = false; unregister_netdev(vsi->netdev); free_netdev(vsi->netdev); vsi->netdev = NULL; } + if (vsi->type == I40E_VSI_MAIN) + i40e_devlink_destroy_port(pf); i40e_aq_delete_element(&pf->hw, vsi->seid, NULL); err_vsi: i40e_vsi_clear(vsi); -- cgit From 3e39da4fa16c9c09207d98b8a86a6f6436b531c9 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 10 Oct 2023 10:32:15 -0700 Subject: ice: Fix SRIOV LAG disable on non-compliant aggregate If an attribute of an aggregate interface disqualifies it from supporting SRIOV, the driver will unwind the SRIOV support. Currently the driver is clearing the feature bit for all interfaces in the aggregate, but this is not allowing the other interfaces to unwind successfully on driver unload. Only clear the feature bit for the interface that is currently unwinding. Fixes: bf65da2eb279 ("ice: enforce interface eligibility and add messaging for SRIOV LAG") Signed-off-by: Dave Ertman Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lag.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index b980f89dc892..95e46bde54fe 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -1555,18 +1555,12 @@ static void ice_lag_chk_disabled_bond(struct ice_lag *lag, void *ptr) */ static void ice_lag_disable_sriov_bond(struct ice_lag *lag) { - struct ice_lag_netdev_list *entry; struct ice_netdev_priv *np; - struct net_device *netdev; struct ice_pf *pf; - list_for_each_entry(entry, lag->netdev_head, node) { - netdev = entry->netdev; - np = netdev_priv(netdev); - pf = np->vsi->back; - - ice_clear_feature_support(pf, ICE_F_SRIOV_LAG); - } + np = netdev_priv(lag->netdev); + pf = np->vsi->back; + ice_clear_feature_support(pf, ICE_F_SRIOV_LAG); } /** -- cgit From e1db8c2a01d7e12bd566106fbeefa3c5cccd2003 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 23 Oct 2023 12:59:53 +0200 Subject: ice: lag: in RCU, use atomic allocation Sleeping is not allowed in RCU read-side critical sections. Use atomic allocations under rcu_read_lock. Fixes: 1e0f9881ef79 ("ice: Flesh out implementation of support for SRIOV on bonded interface") Fixes: 41ccedf5ca8f ("ice: implement lag netdev event handler") Fixes: 3579aa86fb40 ("ice: update reset path for SRIOV LAG support") Signed-off-by: Michal Schmidt Reviewed-by: Wojciech Drewek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lag.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 95e46bde54fe..cd065ec48c87 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -628,7 +628,7 @@ void ice_lag_move_new_vf_nodes(struct ice_vf *vf) INIT_LIST_HEAD(&ndlist.node); rcu_read_lock(); for_each_netdev_in_bond_rcu(lag->upper_netdev, tmp_nd) { - nl = kzalloc(sizeof(*nl), GFP_KERNEL); + nl = kzalloc(sizeof(*nl), GFP_ATOMIC); if (!nl) break; @@ -1692,7 +1692,7 @@ ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event, rcu_read_lock(); for_each_netdev_in_bond_rcu(upper_netdev, tmp_nd) { - nd_list = kzalloc(sizeof(*nd_list), GFP_KERNEL); + nd_list = kzalloc(sizeof(*nd_list), GFP_ATOMIC); if (!nd_list) break; @@ -2069,7 +2069,7 @@ void ice_lag_rebuild(struct ice_pf *pf) INIT_LIST_HEAD(&ndlist.node); rcu_read_lock(); for_each_netdev_in_bond_rcu(lag->upper_netdev, tmp_nd) { - nl = kzalloc(sizeof(*nl), GFP_KERNEL); + nl = kzalloc(sizeof(*nl), GFP_ATOMIC); if (!nl) break; -- cgit From 8b3c8c55ccbc02920b0ae6601c66df24f0d833bd Mon Sep 17 00:00:00 2001 From: Aniruddha Paul Date: Fri, 13 Oct 2023 19:13:42 +0530 Subject: ice: Fix VF-VF filter rules in switchdev mode Any packet leaving VSI i.e VF's VSI is considered as egress traffic by HW, thus failing to match the added rule. Mark the direction for redirect rules as below: 1. VF-VF - Egress 2. Uplink-VF - Ingress 3. VF-Uplink - Egress 4. Link_Partner-Uplink - Ingress 5. Link_Partner-VF - Ingress Fixes: 0960a27bd479 ("ice: Add direction metadata") Reviewed-by: Przemek Kitszel Reviewed-by: Wojciech Drewek Signed-off-by: Aniruddha Paul Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 90 ++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 37b54db91df2..0e75fc6b3c06 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -630,32 +630,61 @@ bool ice_is_tunnel_supported(struct net_device *dev) return ice_tc_tun_get_type(dev) != TNL_LAST; } -static int -ice_eswitch_tc_parse_action(struct ice_tc_flower_fltr *fltr, - struct flow_action_entry *act) +static bool ice_tc_is_dev_uplink(struct net_device *dev) +{ + return netif_is_ice(dev) || ice_is_tunnel_supported(dev); +} + +static int ice_tc_setup_redirect_action(struct net_device *filter_dev, + struct ice_tc_flower_fltr *fltr, + struct net_device *target_dev) { struct ice_repr *repr; + fltr->action.fltr_act = ICE_FWD_TO_VSI; + + if (ice_is_port_repr_netdev(filter_dev) && + ice_is_port_repr_netdev(target_dev)) { + repr = ice_netdev_to_repr(target_dev); + + fltr->dest_vsi = repr->src_vsi; + fltr->direction = ICE_ESWITCH_FLTR_EGRESS; + } else if (ice_is_port_repr_netdev(filter_dev) && + ice_tc_is_dev_uplink(target_dev)) { + repr = ice_netdev_to_repr(filter_dev); + + fltr->dest_vsi = repr->src_vsi->back->switchdev.uplink_vsi; + fltr->direction = ICE_ESWITCH_FLTR_EGRESS; + } else if (ice_tc_is_dev_uplink(filter_dev) && + ice_is_port_repr_netdev(target_dev)) { + repr = ice_netdev_to_repr(target_dev); + + fltr->dest_vsi = repr->src_vsi; + fltr->direction = ICE_ESWITCH_FLTR_INGRESS; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported netdevice in switchdev mode"); + return -EINVAL; + } + + return 0; +} + +static int ice_eswitch_tc_parse_action(struct net_device *filter_dev, + struct ice_tc_flower_fltr *fltr, + struct flow_action_entry *act) +{ + int err; + switch (act->id) { case FLOW_ACTION_DROP: fltr->action.fltr_act = ICE_DROP_PACKET; break; case FLOW_ACTION_REDIRECT: - fltr->action.fltr_act = ICE_FWD_TO_VSI; - - if (ice_is_port_repr_netdev(act->dev)) { - repr = ice_netdev_to_repr(act->dev); - - fltr->dest_vsi = repr->src_vsi; - fltr->direction = ICE_ESWITCH_FLTR_INGRESS; - } else if (netif_is_ice(act->dev) || - ice_is_tunnel_supported(act->dev)) { - fltr->direction = ICE_ESWITCH_FLTR_EGRESS; - } else { - NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported netdevice in switchdev mode"); - return -EINVAL; - } + err = ice_tc_setup_redirect_action(filter_dev, fltr, act->dev); + if (err) + return err; break; @@ -696,10 +725,6 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) goto exit; } - /* egress traffic is always redirect to uplink */ - if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS) - fltr->dest_vsi = vsi->back->switchdev.uplink_vsi; - rule_info.sw_act.fltr_act = fltr->action.fltr_act; if (fltr->action.fltr_act != ICE_DROP_PACKET) rule_info.sw_act.vsi_handle = fltr->dest_vsi->idx; @@ -713,13 +738,21 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) rule_info.flags_info.act_valid = true; if (fltr->direction == ICE_ESWITCH_FLTR_INGRESS) { + /* Uplink to VF */ rule_info.sw_act.flag |= ICE_FLTR_RX; rule_info.sw_act.src = hw->pf_id; rule_info.flags_info.act = ICE_SINGLE_ACT_LB_ENABLE; - } else { + } else if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS && + fltr->dest_vsi == vsi->back->switchdev.uplink_vsi) { + /* VF to Uplink */ rule_info.sw_act.flag |= ICE_FLTR_TX; rule_info.sw_act.src = vsi->idx; rule_info.flags_info.act = ICE_SINGLE_ACT_LAN_ENABLE; + } else { + /* VF to VF */ + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; + rule_info.flags_info.act = ICE_SINGLE_ACT_LB_ENABLE; } /* specify the cookie as filter_rule_id */ @@ -1745,16 +1778,17 @@ ice_tc_parse_action(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr, /** * ice_parse_tc_flower_actions - Parse the actions for a TC filter + * @filter_dev: Pointer to device on which filter is being added * @vsi: Pointer to VSI * @cls_flower: Pointer to TC flower offload structure * @fltr: Pointer to TC flower filter structure * * Parse the actions for a TC filter */ -static int -ice_parse_tc_flower_actions(struct ice_vsi *vsi, - struct flow_cls_offload *cls_flower, - struct ice_tc_flower_fltr *fltr) +static int ice_parse_tc_flower_actions(struct net_device *filter_dev, + struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, + struct ice_tc_flower_fltr *fltr) { struct flow_rule *rule = flow_cls_offload_flow_rule(cls_flower); struct flow_action *flow_action = &rule->action; @@ -1769,7 +1803,7 @@ ice_parse_tc_flower_actions(struct ice_vsi *vsi, flow_action_for_each(i, act, flow_action) { if (ice_is_eswitch_mode_switchdev(vsi->back)) - err = ice_eswitch_tc_parse_action(fltr, act); + err = ice_eswitch_tc_parse_action(filter_dev, fltr, act); else err = ice_tc_parse_action(vsi, fltr, act); if (err) @@ -1856,7 +1890,7 @@ ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, if (err < 0) goto err; - err = ice_parse_tc_flower_actions(vsi, f, fltr); + err = ice_parse_tc_flower_actions(netdev, vsi, f, fltr); if (err < 0) goto err; -- cgit From 68c51db3a16d258e730dd1c04a1de2f7ab038ddf Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Wed, 25 Oct 2023 16:47:24 +0200 Subject: ice: Fix VF-VF direction matching in drop rule in switchdev When adding a drop rule on a VF, rule direction is not being set, which results in it always being set to ingress (ICE_ESWITCH_FLTR_INGRESS equals 0). Because of this, drop rules added on port representors don't match any packets. To fix it, set rule direction in drop action to egress when netdev is a port representor, otherwise set it to ingress. Fixes: 0960a27bd479 ("ice: Add direction metadata") Reviewed-by: Michal Swiatkowski Signed-off-by: Marcin Szycik Tested-by: Sujai Buvaneswaran Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 0e75fc6b3c06..dd03cb69ad26 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -670,6 +670,25 @@ static int ice_tc_setup_redirect_action(struct net_device *filter_dev, return 0; } +static int +ice_tc_setup_drop_action(struct net_device *filter_dev, + struct ice_tc_flower_fltr *fltr) +{ + fltr->action.fltr_act = ICE_DROP_PACKET; + + if (ice_is_port_repr_netdev(filter_dev)) { + fltr->direction = ICE_ESWITCH_FLTR_EGRESS; + } else if (ice_tc_is_dev_uplink(filter_dev)) { + fltr->direction = ICE_ESWITCH_FLTR_INGRESS; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported netdevice in switchdev mode"); + return -EINVAL; + } + + return 0; +} + static int ice_eswitch_tc_parse_action(struct net_device *filter_dev, struct ice_tc_flower_fltr *fltr, struct flow_action_entry *act) @@ -678,7 +697,10 @@ static int ice_eswitch_tc_parse_action(struct net_device *filter_dev, switch (act->id) { case FLOW_ACTION_DROP: - fltr->action.fltr_act = ICE_DROP_PACKET; + err = ice_tc_setup_drop_action(filter_dev, fltr); + if (err) + return err; + break; case FLOW_ACTION_REDIRECT: -- cgit From 9fc3bc7643341dc5be7d269f3d3dbe441d8d7ac3 Mon Sep 17 00:00:00 2001 From: George Shuklin Date: Fri, 3 Nov 2023 13:50:29 +0200 Subject: tg3: power down device only on SYSTEM_POWER_OFF Dell R650xs servers hangs on reboot if tg3 driver calls tg3_power_down. This happens only if network adapters (BCM5720 for R650xs) were initialized using SNP (e.g. by booting ipxe.efi). The actual problem is on Dell side, but this fix allows servers to come back alive after reboot. Signed-off-by: George Shuklin Fixes: 2ca1c94ce0b6 ("tg3: Disable tg3 device on system reboot to avoid triggering AER") Reviewed-by: Pavan Chebbi Reviewed-by: Michael Chan Link: https://lore.kernel.org/r/20231103115029.83273-1-george.shuklin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 5665d0c3668f..9a8d87fe25ff 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -18127,7 +18127,8 @@ static void tg3_shutdown(struct pci_dev *pdev) if (netif_running(dev)) dev_close(dev); - tg3_power_down(tp); + if (system_state == SYSTEM_POWER_OFF) + tg3_power_down(tp); rtnl_unlock(); -- cgit From 115c0f4d58574524ed7fbbcce1b0a86aa5249db1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Nov 2023 20:04:51 +0000 Subject: idpf: fix potential use-after-free in idpf_tso() skb_cow_head() can change skb->head (and thus skb_shinfo(skb)) We must not cache skb_shinfo(skb) before skb_cow_head(). Fixes: 6818c4d5b3c2 ("idpf: add splitq start_xmit") Signed-off-by: Eric Dumazet Cc: Joshua Hay Cc: Alan Brady Cc: Madhu Chittim Cc: Phani Burra Cc: Sridhar Samudrala Cc: Willem de Bruijn Cc: Pavan Kumar Linga Cc: Tony Nguyen Cc: Bailey Forrest Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20231103200451.514047-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 5e1ef70d54fe..1f728a9004d9 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2365,7 +2365,7 @@ static void idpf_tx_splitq_map(struct idpf_queue *tx_q, */ int idpf_tso(struct sk_buff *skb, struct idpf_tx_offload_params *off) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct skb_shared_info *shinfo; union { struct iphdr *v4; struct ipv6hdr *v6; @@ -2379,13 +2379,15 @@ int idpf_tso(struct sk_buff *skb, struct idpf_tx_offload_params *off) u32 paylen, l4_start; int err; - if (!shinfo->gso_size) + if (!skb_is_gso(skb)) return 0; err = skb_cow_head(skb, 0); if (err < 0) return err; + shinfo = skb_shinfo(skb); + ip.hdr = skb_network_header(skb); l4.hdr = skb_transport_header(skb); -- cgit From dbc9e341e3655f03a1eeeda8f0fa7931f54fac58 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Tue, 7 Nov 2023 06:20:59 +0800 Subject: s390/qeth: Fix typo 'weed' in comment Replace 'weed' with 'we' in the comment. Signed-off-by: Kuan-Wei Chiu Reviewed-by: Alexandra Winter Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 6af2511e070c..cf8506d0f185 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3675,7 +3675,7 @@ static void qeth_flush_queue(struct qeth_qdio_out_q *queue) static void qeth_check_outbound_queue(struct qeth_qdio_out_q *queue) { /* - * check if weed have to switch to non-packing mode or if + * check if we have to switch to non-packing mode or if * we have to get a pci flag out on the queue */ if ((atomic_read(&queue->used_buffers) <= QETH_LOW_WATERMARK_PACK) || -- cgit From c542b39b607dc0619e3b1b36e289600a555c4774 Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Sun, 5 Nov 2023 10:58:28 -0800 Subject: tg3: Fix the TX ring stall The TX ring maintained by the tg3 driver can end up in the state, when it has packets queued for sending but the NIC hardware is not informed, so no progress is made. This leads to a multi-second interruption in network traffic followed by dev_watchdog() firing and resetting the queue. The specific sequence of steps is: 1. tg3_start_xmit() is called at least once and queues packet(s) without updating tnapi->prodmbox (netdev_xmit_more() returns true) 2. tg3_start_xmit() is called with an SKB which causes tg3_tso_bug() to be called. 3. tg3_tso_bug() determines that the SKB is too large, ... if (unlikely(tg3_tx_avail(tnapi) <= frag_cnt_est)) { ... stops the queue, and returns NETDEV_TX_BUSY: netif_tx_stop_queue(txq); ... if (tg3_tx_avail(tnapi) <= frag_cnt_est) return NETDEV_TX_BUSY; 4. Since all tg3_tso_bug() call sites directly return, the code updating tnapi->prodmbox is skipped. 5. The queue is stuck now. tg3_start_xmit() is not called while the queue is stopped. The NIC is not processing new packets because tnapi->prodmbox wasn't updated. tg3_tx() is not called by tg3_poll_work() because the all TX descriptions that could be freed has been freed: /* run TX completion thread */ if (tnapi->hw_status->idx[0].tx_consumer != tnapi->tx_cons) { tg3_tx(tnapi); 6. Eventually, dev_watchdog() fires triggering a reset of the queue. This fix makes sure that the tnapi->prodmbox update happens regardless of the reason tg3_start_xmit() returned. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 53 +++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 9a8d87fe25ff..1dee27349367 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -6647,9 +6647,9 @@ static void tg3_tx(struct tg3_napi *tnapi) tnapi->tx_cons = sw_idx; - /* Need to make the tx_cons update visible to tg3_start_xmit() + /* Need to make the tx_cons update visible to __tg3_start_xmit() * before checking for netif_queue_stopped(). Without the - * memory barrier, there is a small possibility that tg3_start_xmit() + * memory barrier, there is a small possibility that __tg3_start_xmit() * will miss it and cause the queue to be stopped forever. */ smp_mb(); @@ -7889,7 +7889,7 @@ static bool tg3_tso_bug_gso_check(struct tg3_napi *tnapi, struct sk_buff *skb) return skb_shinfo(skb)->gso_segs < tnapi->tx_pending / 3; } -static netdev_tx_t tg3_start_xmit(struct sk_buff *, struct net_device *); +static netdev_tx_t __tg3_start_xmit(struct sk_buff *, struct net_device *); /* Use GSO to workaround all TSO packets that meet HW bug conditions * indicated in tg3_tx_frag_set() @@ -7923,7 +7923,7 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi, skb_list_walk_safe(segs, seg, next) { skb_mark_not_on_list(seg); - tg3_start_xmit(seg, tp->dev); + __tg3_start_xmit(seg, tp->dev); } tg3_tso_bug_end: @@ -7933,7 +7933,7 @@ tg3_tso_bug_end: } /* hard_start_xmit for all devices */ -static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t __tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct tg3 *tp = netdev_priv(dev); u32 len, entry, base_flags, mss, vlan = 0; @@ -8182,11 +8182,6 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) netif_tx_wake_queue(txq); } - if (!netdev_xmit_more() || netif_xmit_stopped(txq)) { - /* Packets are ready, update Tx producer idx on card. */ - tw32_tx_mbox(tnapi->prodmbox, entry); - } - return NETDEV_TX_OK; dma_error: @@ -8199,6 +8194,42 @@ drop_nofree: return NETDEV_TX_OK; } +static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct netdev_queue *txq; + u16 skb_queue_mapping; + netdev_tx_t ret; + + skb_queue_mapping = skb_get_queue_mapping(skb); + txq = netdev_get_tx_queue(dev, skb_queue_mapping); + + ret = __tg3_start_xmit(skb, dev); + + /* Notify the hardware that packets are ready by updating the TX ring + * tail pointer. We respect netdev_xmit_more() thus avoiding poking + * the hardware for every packet. To guarantee forward progress the TX + * ring must be drained when it is full as indicated by + * netif_xmit_stopped(). This needs to happen even when the current + * skb was dropped or rejected with NETDEV_TX_BUSY. Otherwise packets + * queued by previous __tg3_start_xmit() calls might get stuck in + * the queue forever. + */ + if (!netdev_xmit_more() || netif_xmit_stopped(txq)) { + struct tg3_napi *tnapi; + struct tg3 *tp; + + tp = netdev_priv(dev); + tnapi = &tp->napi[skb_queue_mapping]; + + if (tg3_flag(tp, ENABLE_TSS)) + tnapi++; + + tw32_tx_mbox(tnapi->prodmbox, tnapi->tx_prod); + } + + return ret; +} + static void tg3_mac_loopback(struct tg3 *tp, bool enable) { if (enable) { @@ -17729,7 +17760,7 @@ static int tg3_init_one(struct pci_dev *pdev, * device behind the EPB cannot support DMA addresses > 40-bit. * On 64-bit systems with IOMMU, use 40-bit dma_mask. * On 64-bit systems without IOMMU, use 64-bit dma_mask and - * do DMA address check in tg3_start_xmit(). + * do DMA address check in __tg3_start_xmit(). */ if (tg3_flag(tp, IS_5788)) persist_dma_mask = dma_mask = DMA_BIT_MASK(32); -- cgit From 7425627b2b2cd671d5bf6541ce50f7cba8a76ad6 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 6 Nov 2023 14:14:16 -0700 Subject: tcp: Fix -Wc23-extensions in tcp_options_write() Clang warns (or errors with CONFIG_WERROR=y) when CONFIG_TCP_AO is set: net/ipv4/tcp_output.c:663:2: error: label at end of compound statement is a C23 extension [-Werror,-Wc23-extensions] 663 | } | ^ 1 error generated. On earlier releases (such as clang-11, the current minimum supported version for building the kernel) that do not support C23, this was a hard error unconditionally: net/ipv4/tcp_output.c:663:2: error: expected statement } ^ 1 error generated. While adding a semicolon after the label would resolve this, it is more in line with the kernel as a whole to refactor this block into a standalone function, which means the goto a label construct can just be replaced with a return statement. Do so to resolve the warning. Closes: https://github.com/ClangBuiltLinux/linux/issues/1953 Fixes: 1e03d32bea8e ("net/tcp: Add TCP-AO sign to outgoing packets") Signed-off-by: Nathan Chancellor Reviewed-by: Dmitry Safonov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 70 ++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0d8dd5b7e2e5..eb13a55d660c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -601,6 +601,44 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, } #endif +static __be32 *process_tcp_ao_options(struct tcp_sock *tp, + const struct tcp_request_sock *tcprsk, + struct tcp_out_options *opts, + struct tcp_key *key, __be32 *ptr) +{ +#ifdef CONFIG_TCP_AO + u8 maclen = tcp_ao_maclen(key->ao_key); + + if (tcprsk) { + u8 aolen = maclen + sizeof(struct tcp_ao_hdr); + + *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) | + (tcprsk->ao_keyid << 8) | + (tcprsk->ao_rcv_next)); + } else { + struct tcp_ao_key *rnext_key; + struct tcp_ao_info *ao_info; + + ao_info = rcu_dereference_check(tp->ao_info, + lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk)); + rnext_key = READ_ONCE(ao_info->rnext_key); + if (WARN_ON_ONCE(!rnext_key)) + return ptr; + *ptr++ = htonl((TCPOPT_AO << 24) | + (tcp_ao_len(key->ao_key) << 16) | + (key->ao_key->sndid << 8) | + (rnext_key->rcvid)); + } + opts->hash_location = (__u8 *)ptr; + ptr += maclen / sizeof(*ptr); + if (unlikely(maclen % sizeof(*ptr))) { + memset(ptr, TCPOPT_NOP, sizeof(*ptr)); + ptr++; + } +#endif + return ptr; +} + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -629,37 +667,7 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, opts->hash_location = (__u8 *)ptr; ptr += 4; } else if (tcp_key_is_ao(key)) { -#ifdef CONFIG_TCP_AO - u8 maclen = tcp_ao_maclen(key->ao_key); - - if (tcprsk) { - u8 aolen = maclen + sizeof(struct tcp_ao_hdr); - - *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) | - (tcprsk->ao_keyid << 8) | - (tcprsk->ao_rcv_next)); - } else { - struct tcp_ao_key *rnext_key; - struct tcp_ao_info *ao_info; - - ao_info = rcu_dereference_check(tp->ao_info, - lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk)); - rnext_key = READ_ONCE(ao_info->rnext_key); - if (WARN_ON_ONCE(!rnext_key)) - goto out_ao; - *ptr++ = htonl((TCPOPT_AO << 24) | - (tcp_ao_len(key->ao_key) << 16) | - (key->ao_key->sndid << 8) | - (rnext_key->rcvid)); - } - opts->hash_location = (__u8 *)ptr; - ptr += maclen / sizeof(*ptr); - if (unlikely(maclen % sizeof(*ptr))) { - memset(ptr, TCPOPT_NOP, sizeof(*ptr)); - ptr++; - } -out_ao: -#endif + ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr); } if (unlikely(opts->mss)) { *ptr++ = htonl((TCPOPT_MSS << 24) | -- cgit From 3a5cc90a4d1756072619fe511d07621bdef7f120 Mon Sep 17 00:00:00 2001 From: Filippo Storniolo Date: Fri, 3 Nov 2023 18:55:48 +0100 Subject: vsock/virtio: remove socket from connected/bound list on shutdown If the same remote peer, using the same port, tries to connect to a server on a listening port more than once, the server will reject the connection, causing a "connection reset by peer" error on the remote peer. This is due to the presence of a dangling socket from a previous connection in both the connected and bound socket lists. The inconsistency of the above lists only occurs when the remote peer disconnects and the server remains active. This bug does not occur when the server socket is closed: virtio_transport_release() will eventually schedule a call to virtio_transport_do_close() and the latter will remove the socket from the bound and connected socket lists and clear the sk_buff. However, virtio_transport_do_close() will only perform the above actions if it has been scheduled, and this will not happen if the server is processing the shutdown message from a remote peer. To fix this, introduce a call to vsock_remove_sock() when the server is handling a client disconnect. This is to remove the socket from the bound and connected socket lists without clearing the sk_buff. Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Reported-by: Daan De Meyer Tested-by: Daan De Meyer Co-developed-by: Luigi Leonardi Signed-off-by: Luigi Leonardi Signed-off-by: Filippo Storniolo Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e22c81435ef7..4c595dd1fd64 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1369,11 +1369,17 @@ virtio_transport_recv_connected(struct sock *sk, vsk->peer_shutdown |= RCV_SHUTDOWN; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) vsk->peer_shutdown |= SEND_SHUTDOWN; - if (vsk->peer_shutdown == SHUTDOWN_MASK && - vsock_stream_has_data(vsk) <= 0 && - !sock_flag(sk, SOCK_DONE)) { - (void)virtio_transport_reset(vsk, NULL); - virtio_transport_do_close(vsk, true); + if (vsk->peer_shutdown == SHUTDOWN_MASK) { + if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + virtio_transport_do_close(vsk, true); + } + /* Remove this socket anyway because the remote peer sent + * the shutdown. This way a new connection will succeed + * if the remote peer uses the same source port, + * even if the old socket is still unreleased, but now disconnected. + */ + vsock_remove_sock(vsk); } if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) sk->sk_state_change(sk); -- cgit From bfada5a7672fea5465d81bba3d05fca6024a244e Mon Sep 17 00:00:00 2001 From: Filippo Storniolo Date: Fri, 3 Nov 2023 18:55:49 +0100 Subject: test/vsock fix: add missing check on socket creation Add check on socket() return value in vsock_listen() and vsock_connect() Co-developed-by: Luigi Leonardi Signed-off-by: Luigi Leonardi Signed-off-by: Filippo Storniolo Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- tools/testing/vsock/util.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 92336721321a..698b0b44a2ee 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -104,6 +104,10 @@ static int vsock_connect(unsigned int cid, unsigned int port, int type) control_expectln("LISTENING"); fd = socket(AF_VSOCK, type, 0); + if (fd < 0) { + perror("socket"); + exit(EXIT_FAILURE); + } timeout_begin(TIMEOUT); do { @@ -158,6 +162,10 @@ static int vsock_accept(unsigned int cid, unsigned int port, int old_errno; fd = socket(AF_VSOCK, type, 0); + if (fd < 0) { + perror("socket"); + exit(EXIT_FAILURE); + } if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) { perror("bind"); -- cgit From 84d5fb9741316ca53f0f7c23b82f30e0bb33c38e Mon Sep 17 00:00:00 2001 From: Filippo Storniolo Date: Fri, 3 Nov 2023 18:55:50 +0100 Subject: test/vsock: refactor vsock_accept This is a preliminary patch to introduce SOCK_STREAM bind connect test. vsock_accept() is split into vsock_listen() and vsock_accept(). Co-developed-by: Luigi Leonardi Signed-off-by: Luigi Leonardi Signed-off-by: Filippo Storniolo Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- tools/testing/vsock/util.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 698b0b44a2ee..2fc96f29bdf2 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -136,11 +136,8 @@ int vsock_seqpacket_connect(unsigned int cid, unsigned int port) return vsock_connect(cid, port, SOCK_SEQPACKET); } -/* Listen on and return the first incoming connection. The remote - * address is stored to clientaddrp. clientaddrp may be NULL. - */ -static int vsock_accept(unsigned int cid, unsigned int port, - struct sockaddr_vm *clientaddrp, int type) +/* Listen on and return the file descriptor. */ +static int vsock_listen(unsigned int cid, unsigned int port, int type) { union { struct sockaddr sa; @@ -152,14 +149,7 @@ static int vsock_accept(unsigned int cid, unsigned int port, .svm_cid = cid, }, }; - union { - struct sockaddr sa; - struct sockaddr_vm svm; - } clientaddr; - socklen_t clientaddr_len = sizeof(clientaddr.svm); int fd; - int client_fd; - int old_errno; fd = socket(AF_VSOCK, type, 0); if (fd < 0) { @@ -177,6 +167,24 @@ static int vsock_accept(unsigned int cid, unsigned int port, exit(EXIT_FAILURE); } + return fd; +} + +/* Listen on and return the first incoming connection. The remote + * address is stored to clientaddrp. clientaddrp may be NULL. + */ +static int vsock_accept(unsigned int cid, unsigned int port, + struct sockaddr_vm *clientaddrp, int type) +{ + union { + struct sockaddr sa; + struct sockaddr_vm svm; + } clientaddr; + socklen_t clientaddr_len = sizeof(clientaddr.svm); + int fd, client_fd, old_errno; + + fd = vsock_listen(cid, port, type); + control_writeln("LISTENING"); timeout_begin(TIMEOUT); -- cgit From d80f63f690257b04b4fe3731b90dbf34a9ebb93f Mon Sep 17 00:00:00 2001 From: Filippo Storniolo Date: Fri, 3 Nov 2023 18:55:51 +0100 Subject: test/vsock: add dobule bind connect test This add bind connect test which creates a listening server socket and tries to connect a client with a bound local port to it twice. Co-developed-by: Luigi Leonardi Signed-off-by: Luigi Leonardi Signed-off-by: Filippo Storniolo Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- tools/testing/vsock/util.c | 47 +++++++++++++++++++++++++++++++++++++ tools/testing/vsock/util.h | 3 +++ tools/testing/vsock/vsock_test.c | 50 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 2fc96f29bdf2..ae2b33c21c45 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -85,6 +85,48 @@ void vsock_wait_remote_close(int fd) close(epollfd); } +/* Bind to , connect to and return the file descriptor. */ +int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type) +{ + struct sockaddr_vm sa_client = { + .svm_family = AF_VSOCK, + .svm_cid = VMADDR_CID_ANY, + .svm_port = bind_port, + }; + struct sockaddr_vm sa_server = { + .svm_family = AF_VSOCK, + .svm_cid = cid, + .svm_port = port, + }; + + int client_fd, ret; + + client_fd = socket(AF_VSOCK, type, 0); + if (client_fd < 0) { + perror("socket"); + exit(EXIT_FAILURE); + } + + if (bind(client_fd, (struct sockaddr *)&sa_client, sizeof(sa_client))) { + perror("bind"); + exit(EXIT_FAILURE); + } + + timeout_begin(TIMEOUT); + do { + ret = connect(client_fd, (struct sockaddr *)&sa_server, sizeof(sa_server)); + timeout_check("connect"); + } while (ret < 0 && errno == EINTR); + timeout_end(); + + if (ret < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + return client_fd; +} + /* Connect to and return the file descriptor. */ static int vsock_connect(unsigned int cid, unsigned int port, int type) { @@ -223,6 +265,11 @@ int vsock_stream_accept(unsigned int cid, unsigned int port, return vsock_accept(cid, port, clientaddrp, SOCK_STREAM); } +int vsock_stream_listen(unsigned int cid, unsigned int port) +{ + return vsock_listen(cid, port, SOCK_STREAM); +} + int vsock_seqpacket_accept(unsigned int cid, unsigned int port, struct sockaddr_vm *clientaddrp) { diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index a77175d25864..03c88d0cb861 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -36,9 +36,12 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); int vsock_stream_connect(unsigned int cid, unsigned int port); +int vsock_bind_connect(unsigned int cid, unsigned int port, + unsigned int bind_port, int type); int vsock_seqpacket_connect(unsigned int cid, unsigned int port); int vsock_stream_accept(unsigned int cid, unsigned int port, struct sockaddr_vm *clientaddrp); +int vsock_stream_listen(unsigned int cid, unsigned int port); int vsock_seqpacket_accept(unsigned int cid, unsigned int port, struct sockaddr_vm *clientaddrp); void vsock_wait_remote_close(int fd); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index c1f7bc9abd22..5b0e93f9996c 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1180,6 +1180,51 @@ static void test_stream_shutrd_server(const struct test_opts *opts) close(fd); } +static void test_double_bind_connect_server(const struct test_opts *opts) +{ + int listen_fd, client_fd, i; + struct sockaddr_vm sa_client; + socklen_t socklen_client = sizeof(sa_client); + + listen_fd = vsock_stream_listen(VMADDR_CID_ANY, 1234); + + for (i = 0; i < 2; i++) { + control_writeln("LISTENING"); + + timeout_begin(TIMEOUT); + do { + client_fd = accept(listen_fd, (struct sockaddr *)&sa_client, + &socklen_client); + timeout_check("accept"); + } while (client_fd < 0 && errno == EINTR); + timeout_end(); + + if (client_fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + /* Waiting for remote peer to close connection */ + vsock_wait_remote_close(client_fd); + } + + close(listen_fd); +} + +static void test_double_bind_connect_client(const struct test_opts *opts) +{ + int i, client_fd; + + for (i = 0; i < 2; i++) { + /* Wait until server is ready to accept a new connection */ + control_expectln("LISTENING"); + + client_fd = vsock_bind_connect(opts->peer_cid, 1234, 4321, SOCK_STREAM); + + close(client_fd); + } +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1285,6 +1330,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_msgzcopy_empty_errq_client, .run_server = test_stream_msgzcopy_empty_errq_server, }, + { + .name = "SOCK_STREAM double bind connect", + .run_client = test_double_bind_connect_client, + .run_server = test_double_bind_connect_server, + }, {}, }; -- cgit From 02d5fdbf4f2b8c406f7a4c98fa52aa181a11d733 Mon Sep 17 00:00:00 2001 From: Klaus Kudielka Date: Tue, 7 Nov 2023 18:44:02 +0100 Subject: net: phylink: initialize carrier state at creation Background: Turris Omnia (Armada 385); eth2 (mvneta) connected to SFP bus; SFP module is present, but no fiber connected, so definitely no carrier. After booting, eth2 is down, but netdev LED trigger surprisingly reports link active. Then, after "ip link set eth2 up", the link indicator goes away - as I would have expected it from the beginning. It turns out, that the default carrier state after netdev creation is "carrier ok". Some ethernet drivers explicitly call netif_carrier_off during probing, others (like mvneta) don't - which explains the current behaviour: only when the device is brought up, phylink_start calls netif_carrier_off. Fix this for all drivers using phylink, by calling netif_carrier_off in phylink_create. Fixes: 089381b27abe ("leds: initial support for Turris Omnia LEDs") Cc: stable@vger.kernel.org Suggested-by: Andrew Lunn Signed-off-by: Klaus Kudielka Reviewed-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 6fdc6ba294d5..25c19496a336 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1616,6 +1616,7 @@ struct phylink *phylink_create(struct phylink_config *config, pl->config = config; if (config->type == PHYLINK_NETDEV) { pl->netdev = to_net_dev(config->dev); + netif_carrier_off(pl->netdev); } else if (config->type == PHYLINK_DEV) { pl->dev = config->dev; } else { -- cgit From 0de4f50de25af79c2a46db55d70cdbd8f985c6d1 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Tue, 7 Nov 2023 21:22:03 +0800 Subject: bpf: Let verifier consider {task,cgroup} is trusted in bpf_iter_reg BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) in verifier.c wanted to teach BPF verifier that bpf_iter__task -> task is a trusted ptr. But it doesn't work well. The reason is, bpf_iter__task -> task would go through btf_ctx_access() which enforces the reg_type of 'task' is ctx_arg_info->reg_type, and in task_iter.c, we actually explicitly declare that the ctx_arg_info->reg_type is PTR_TO_BTF_ID_OR_NULL. Actually we have a previous case like this[1] where PTR_TRUSTED is added to the arg flag for map_iter. This patch sets ctx_arg_info->reg_type is PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED in task_reg_info. Similarly, bpf_cgroup_reg_info -> cgroup is also PTR_TRUSTED since we are under the protection of cgroup_mutex and we would check cgroup_is_dead() in __cgroup_iter_seq_show(). This patch is to improve the user experience of the newly introduced bpf_iter_css_task kfunc before hitting the mainline. The Fixes tag is pointing to the commit introduced the bpf_iter_css_task kfunc. Link[1]:https://lore.kernel.org/all/20230706133932.45883-3-aspsk@isovalent.com/ Fixes: 9c66dc94b62a ("bpf: Introduce css_task open-coded iterator kfuncs") Signed-off-by: Chuyi Zhou Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231107132204.912120-2-zhouchuyi@bytedance.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/cgroup_iter.c | 2 +- kernel/bpf/task_iter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index d1b5c5618dd7..f04a468cf6a7 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__cgroup, cgroup), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &cgroup_iter_seq_info, }; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 4e156dca48de..26082b97894d 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -704,7 +704,7 @@ static struct bpf_iter_reg task_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &task_seq_info, .fill_link_info = bpf_iter_fill_link_info, -- cgit From 3c5864ba9cf912ff9809f315d28f296f21563cce Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Tue, 7 Nov 2023 21:22:04 +0800 Subject: selftests/bpf: get trusted cgrp from bpf_iter__cgroup directly Commit f49843afde (selftests/bpf: Add tests for css_task iter combining with cgroup iter) added a test which demonstrates how css_task iter can be combined with cgroup iter. That test used bpf_cgroup_from_id() to convert bpf_iter__cgroup->cgroup to a trusted ptr which is pointless now, since with the previous fix, we can get a trusted cgroup directly from bpf_iter__cgroup. Signed-off-by: Chuyi Zhou Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231107132204.912120-3-zhouchuyi@bytedance.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/iters_css_task.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/iters_css_task.c b/tools/testing/selftests/bpf/progs/iters_css_task.c index e180aa1b1d52..9ac758649cb8 100644 --- a/tools/testing/selftests/bpf/progs/iters_css_task.c +++ b/tools/testing/selftests/bpf/progs/iters_css_task.c @@ -56,12 +56,9 @@ SEC("?iter/cgroup") int cgroup_id_printer(struct bpf_iter__cgroup *ctx) { struct seq_file *seq = ctx->meta->seq; - struct cgroup *cgrp, *acquired; + struct cgroup *cgrp = ctx->cgroup; struct cgroup_subsys_state *css; struct task_struct *task; - u64 cgrp_id; - - cgrp = ctx->cgroup; /* epilogue */ if (cgrp == NULL) { @@ -73,20 +70,15 @@ int cgroup_id_printer(struct bpf_iter__cgroup *ctx) if (ctx->meta->seq_num == 0) BPF_SEQ_PRINTF(seq, "prologue\n"); - cgrp_id = cgroup_id(cgrp); - - BPF_SEQ_PRINTF(seq, "%8llu\n", cgrp_id); + BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp)); - acquired = bpf_cgroup_from_id(cgrp_id); - if (!acquired) - return 0; - css = &acquired->self; + css = &cgrp->self; css_task_cnt = 0; bpf_for_each(css_task, task, css, CSS_TASK_ITER_PROCS) { if (task->pid == target_pid) css_task_cnt++; } - bpf_cgroup_release(acquired); + return 0; } -- cgit From 8999ce4cfc87e61b4143ec2e7b93d8e92e11fa7f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 5 Nov 2023 23:43:36 +0100 Subject: r8169: respect userspace disabling IFF_MULTICAST So far we ignore the setting of IFF_MULTICAST. Fix this and clear bit AcceptMulticast if IFF_MULTICAST isn't set. Note: Based on the implementations I've seen it doesn't seem to be 100% clear what a driver is supposed to do if IFF_ALLMULTI is set but IFF_MULTICAST is not. This patch is based on the understanding that IFF_MULTICAST has precedence. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/4a57ba02-d52d-4369-9f14-3565e6c1f7dc@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4b8251cdb436..0c76c162b8a9 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2582,6 +2582,8 @@ static void rtl_set_rx_mode(struct net_device *dev) if (dev->flags & IFF_PROMISC) { rx_mode |= AcceptAllPhys; + } else if (!(dev->flags & IFF_MULTICAST)) { + rx_mode &= ~AcceptMulticast; } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || dev->flags & IFF_ALLMULTI || tp->mac_version == RTL_GIGA_MAC_VER_35 || -- cgit From 34c4effacfc329aeca5635a69fd9e0f6c90b4101 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 5 Nov 2023 00:05:31 +0900 Subject: virtio/vsock: Fix uninit-value in virtio_transport_recv_pkt() KMSAN reported the following uninit-value access issue: ===================================================== BUG: KMSAN: uninit-value in virtio_transport_recv_pkt+0x1dfb/0x26a0 net/vmw_vsock/virtio_transport_common.c:1421 virtio_transport_recv_pkt+0x1dfb/0x26a0 net/vmw_vsock/virtio_transport_common.c:1421 vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703 worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784 kthread+0x3cc/0x520 kernel/kthread.c:388 ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 Uninit was stored to memory at: virtio_transport_space_update net/vmw_vsock/virtio_transport_common.c:1274 [inline] virtio_transport_recv_pkt+0x1ee8/0x26a0 net/vmw_vsock/virtio_transport_common.c:1415 vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703 worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784 kthread+0x3cc/0x520 kernel/kthread.c:388 ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 Uninit was created at: slab_post_alloc_hook+0x105/0xad0 mm/slab.h:767 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5a2/0xaf0 mm/slub.c:3523 kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:559 __alloc_skb+0x2fd/0x770 net/core/skbuff.c:650 alloc_skb include/linux/skbuff.h:1286 [inline] virtio_vsock_alloc_skb include/linux/virtio_vsock.h:66 [inline] virtio_transport_alloc_skb+0x90/0x11e0 net/vmw_vsock/virtio_transport_common.c:58 virtio_transport_reset_no_sock net/vmw_vsock/virtio_transport_common.c:957 [inline] virtio_transport_recv_pkt+0x1279/0x26a0 net/vmw_vsock/virtio_transport_common.c:1387 vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703 worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784 kthread+0x3cc/0x520 kernel/kthread.c:388 ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 CPU: 1 PID: 10664 Comm: kworker/1:5 Not tainted 6.6.0-rc3-00146-g9f3ebbef746f #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 Workqueue: vsock-loopback vsock_loopback_work ===================================================== The following simple reproducer can cause the issue described above: int main(void) { int sock; struct sockaddr_vm addr = { .svm_family = AF_VSOCK, .svm_cid = VMADDR_CID_ANY, .svm_port = 1234, }; sock = socket(AF_VSOCK, SOCK_STREAM, 0); connect(sock, (struct sockaddr *)&addr, sizeof(addr)); return 0; } This issue occurs because the `buf_alloc` and `fwd_cnt` fields of the `struct virtio_vsock_hdr` are not initialized when a new skb is allocated in `virtio_transport_init_hdr()`. This patch resolves the issue by initializing these fields during allocation. Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Reported-and-tested-by: syzbot+0c8ce1da0ac31abbadcd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0c8ce1da0ac31abbadcd Signed-off-by: Shigeru Yoshida Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20231104150531.257952-1-syoshida@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 4c595dd1fd64..f6dc896bf44c 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -130,6 +130,8 @@ static void virtio_transport_init_hdr(struct sk_buff *skb, hdr->dst_port = cpu_to_le32(dst_port); hdr->flags = cpu_to_le32(info->flags); hdr->len = cpu_to_le32(payload_len); + hdr->buf_alloc = cpu_to_le32(0); + hdr->fwd_cnt = cpu_to_le32(0); } static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, -- cgit From f968c56417f00be4cb62eadeed042a1e3c80dc53 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 6 Nov 2023 18:03:11 +0200 Subject: net: enetc: shorten enetc_setup_xdp_prog() error message to fit NETLINK_MAX_FMTMSG_LEN NETLINK_MAX_FMTMSG_LEN is currently hardcoded to 80, and we provide an error printf-formatted string having 96 characters including the terminating \0. Assuming each %d (representing a queue) gets replaced by a number having at most 2 digits (a reasonable assumption), the final string is also 96 characters wide, which is too much. Reduce the verbiage a bit by removing some (partially) redundant words, which makes the new printf-formatted string be 73 characters wide with the trailing newline. Fixes: 800db2d125c2 ("net: enetc: ensure we always have a minimum number of TXQs for stack") Reported-by: kernel test robot Closes: https://lore.kernel.org/lkml/202311061336.4dsWMT1h-lkp@intel.com/ Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231106160311.616118-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 30bec47bc665..cffbf27c4656 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2769,7 +2769,7 @@ static int enetc_setup_xdp_prog(struct net_device *ndev, struct bpf_prog *prog, if (priv->min_num_stack_tx_queues + num_xdp_tx_queues > priv->num_tx_rings) { NL_SET_ERR_MSG_FMT_MOD(extack, - "Reserving %d XDP TXQs does not leave a minimum of %d TXQs for network stack (total %d available)", + "Reserving %d XDP TXQs does not leave a minimum of %d for stack (total %d)", num_xdp_tx_queues, priv->min_num_stack_tx_queues, priv->num_tx_rings); -- cgit From caf3100810f4150677f4e1057aa0a29f8a2c3743 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 6 Nov 2023 10:16:00 +0100 Subject: drivers/net/ppp: use standard array-copy-function In ppp_generic.c, memdup_user() is utilized to copy a userspace array. This is done without an overflow-check, which is, however, not critical because the multiplicands are an unsigned short and struct sock_filter, which is currently of size 8. Regardless, string.h now provides memdup_array_user(), a wrapper for copying userspace arrays in a standardized manner, which has the advantage of making it more obvious to the reader that an array is being copied. The wrapper additionally performs an obligatory overflow check, saving the reader the effort of analyzing the potential for overflow, and making the code a bit more robust in case of future changes to the multiplicands len * size. Replace memdup_user() with memdup_array_user(). Suggested-by: Dave Airlie Signed-off-by: Philipp Stanner Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a9beacd552cf..0193af2d31c9 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -570,8 +570,8 @@ static struct bpf_prog *get_filter(struct sock_fprog *uprog) /* uprog->len is unsigned short, so no overflow here */ fprog.len = uprog->len; - fprog.filter = memdup_user(uprog->filter, - uprog->len * sizeof(struct sock_filter)); + fprog.filter = memdup_array_user(uprog->filter, + uprog->len, sizeof(struct sock_filter)); if (IS_ERR(fprog.filter)) return ERR_CAST(fprog.filter); -- cgit From 94090b23f3f71c150359a2e0716855a4037ad45a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 4 Nov 2023 11:14:05 +0100 Subject: netfilter: add missing module descriptions W=1 builds warn on missing MODULE_DESCRIPTION, add them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtable_broute.c | 1 + net/bridge/netfilter/ebtable_filter.c | 1 + net/bridge/netfilter/ebtable_nat.c | 1 + net/bridge/netfilter/ebtables.c | 1 + net/bridge/netfilter/nf_conntrack_bridge.c | 1 + net/ipv4/netfilter/iptable_nat.c | 1 + net/ipv4/netfilter/iptable_raw.c | 1 + net/ipv4/netfilter/nf_defrag_ipv4.c | 1 + net/ipv4/netfilter/nf_reject_ipv4.c | 1 + net/ipv6/netfilter/ip6table_nat.c | 1 + net/ipv6/netfilter/ip6table_raw.c | 1 + net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 1 + net/ipv6/netfilter/nf_reject_ipv6.c | 1 + net/netfilter/nf_conntrack_broadcast.c | 1 + net/netfilter/nf_conntrack_netlink.c | 1 + net/netfilter/nf_conntrack_proto.c | 1 + net/netfilter/nf_nat_core.c | 1 + net/netfilter/nf_tables_api.c | 1 + net/netfilter/nfnetlink_osf.c | 1 + net/netfilter/nft_chain_nat.c | 1 + net/netfilter/nft_fib.c | 1 + net/netfilter/nft_fwd_netdev.c | 1 + 22 files changed, 22 insertions(+) diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 8f19253024b0..741360219552 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -135,3 +135,4 @@ static void __exit ebtable_broute_fini(void) module_init(ebtable_broute_init); module_exit(ebtable_broute_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Force packets to be routed instead of bridged"); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 278f324e6752..dacd81b12e62 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -116,3 +116,4 @@ static void __exit ebtable_filter_fini(void) module_init(ebtable_filter_init); module_exit(ebtable_filter_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ebtables legacy filter table"); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 9066f7f376d5..0f2a8c6118d4 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -116,3 +116,4 @@ static void __exit ebtable_nat_fini(void) module_init(ebtable_nat_init); module_exit(ebtable_nat_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ebtables legacy stateless nat table"); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index aa23479b20b2..99d82676f780 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2595,3 +2595,4 @@ EXPORT_SYMBOL(ebt_do_table); module_init(ebtables_init); module_exit(ebtables_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ebtables legacy core"); diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 71056ee84773..b5c406a6e765 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -416,3 +416,4 @@ module_exit(nf_conntrack_l3proto_bridge_fini); MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE)); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Bridge IPv4 and IPv6 connection tracking"); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 56f6ecc43451..4d42d0756fd7 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -170,3 +170,4 @@ module_init(iptable_nat_init); module_exit(iptable_nat_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables legacy nat table"); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index ca5e5b21587c..0e7f53964d0a 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -108,3 +108,4 @@ static void __exit iptable_raw_fini(void) module_init(iptable_raw_init); module_exit(iptable_raw_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables legacy raw table"); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 265b39bc435b..482e733c3375 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -186,3 +186,4 @@ module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv4 defragmentation support"); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index f33aeab9424f..f01b038fc1cd 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -336,3 +336,4 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) EXPORT_SYMBOL_GPL(nf_send_unreach); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv4 packet rejection core"); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index bf3cb3a13600..52cf104e3478 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -170,3 +170,4 @@ module_init(ip6table_nat_init); module_exit(ip6table_nat_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Ip6tables legacy nat table"); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 08861d5d1f4d..fc9f6754028f 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -106,3 +106,4 @@ static void __exit ip6table_raw_fini(void) module_init(ip6table_raw_init); module_exit(ip6table_raw_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Ip6tables legacy raw table"); diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index d59b296b4f51..be7817fbc024 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -182,3 +182,4 @@ module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 defragmentation support"); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 58ccdb08c0fd..d45bc54b7ea5 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -413,3 +413,4 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, EXPORT_SYMBOL_GPL(nf_send_unreach6); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 packet rejection core"); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 9fb9b8031298..cfa0fe0356de 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -82,3 +82,4 @@ out: EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Broadcast connection tracking helper"); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 334db22199c1..fb0ae15e96df 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -57,6 +57,7 @@ #include "nf_internals.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { struct nf_conn *last; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index c928ff63b10e..f36727ed91e1 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -699,3 +699,4 @@ MODULE_ALIAS("ip_conntrack"); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking"); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index c4e0516a8dfa..c3d7ecbc777c 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1263,6 +1263,7 @@ static void __exit nf_nat_cleanup(void) } MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Network address translation core"); module_init(nf_nat_init); module_exit(nf_nat_cleanup); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3c1fd8283bf4..146b7447a969 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -11386,4 +11386,5 @@ module_exit(nf_tables_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Framework for packet filtering and classification"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 50723ba08289..c0fc431991e8 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -447,4 +447,5 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Passive OS fingerprint matching"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index 98e4946100c5..40e230d8b712 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -137,6 +137,7 @@ module_init(nft_chain_nat_init); module_exit(nft_chain_nat_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("nftables network address translation support"); #ifdef CONFIG_NF_TABLES_IPV4 MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); #endif diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 04b51f285332..1bfe258018da 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -204,4 +204,5 @@ bool nft_fib_reduce(struct nft_regs_track *track, EXPORT_SYMBOL_GPL(nft_fib_reduce); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Query routing table from nftables"); MODULE_AUTHOR("Florian Westphal "); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index a5268e6dd32f..358e742afad7 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -270,4 +270,5 @@ module_exit(nft_fwd_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_DESCRIPTION("nftables netdev packet forwarding support"); MODULE_ALIAS_NFT_AF_EXPR(5, "fwd"); -- cgit From 93995bf4af2c5a99e2a87f0cd5ce547d31eb7630 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Nov 2023 10:53:09 +0100 Subject: netfilter: nf_tables: remove catchall element in GC sync path The expired catchall element is not deactivated and removed from GC sync path. This path holds mutex so just call nft_setelem_data_deactivate() and nft_setelem_catchall_remove() before queueing the GC work. Fixes: 4a9e12ea7e70 ("netfilter: nft_set_pipapo: call nft_trans_gc_queue_sync() in catchall GC") Reported-by: lonial con Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 146b7447a969..a761ee6796f6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6520,6 +6520,12 @@ static int nft_setelem_deactivate(const struct net *net, return ret; } +static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall) +{ + list_del_rcu(&catchall->list); + kfree_rcu(catchall, rcu); +} + static void nft_setelem_catchall_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -6528,8 +6534,7 @@ static void nft_setelem_catchall_remove(const struct net *net, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem_priv) { - list_del_rcu(&catchall->list); - kfree_rcu(catchall, rcu); + nft_setelem_catchall_destroy(catchall); break; } } @@ -9678,11 +9683,12 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, unsigned int gc_seq, bool sync) { - struct nft_set_elem_catchall *catchall; + struct nft_set_elem_catchall *catchall, *next; const struct nft_set *set = gc->set; + struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) @@ -9700,7 +9706,13 @@ dead_elem: if (!gc) return NULL; - nft_trans_gc_elem_add(gc, catchall->elem); + elem_priv = catchall->elem; + if (sync) { + nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); + nft_setelem_catchall_destroy(catchall); + } + + nft_trans_gc_elem_add(gc, elem_priv); } return gc; -- cgit From 17cd01e4d1e37e2c8051bbc0ca1ecca4cb001198 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Nov 2023 10:48:04 +0100 Subject: ipvs: add missing module descriptions W=1 builds warn on missing MODULE_DESCRIPTION, add them. Signed-off-by: Florian Westphal Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 1 + net/netfilter/ipvs/ip_vs_dh.c | 1 + net/netfilter/ipvs/ip_vs_fo.c | 1 + net/netfilter/ipvs/ip_vs_ftp.c | 1 + net/netfilter/ipvs/ip_vs_lblc.c | 1 + net/netfilter/ipvs/ip_vs_lblcr.c | 1 + net/netfilter/ipvs/ip_vs_lc.c | 1 + net/netfilter/ipvs/ip_vs_nq.c | 1 + net/netfilter/ipvs/ip_vs_ovf.c | 1 + net/netfilter/ipvs/ip_vs_pe_sip.c | 1 + net/netfilter/ipvs/ip_vs_rr.c | 1 + net/netfilter/ipvs/ip_vs_sed.c | 1 + net/netfilter/ipvs/ip_vs_sh.c | 1 + net/netfilter/ipvs/ip_vs_twos.c | 1 + net/netfilter/ipvs/ip_vs_wlc.c | 1 + net/netfilter/ipvs/ip_vs_wrr.c | 1 + 16 files changed, 16 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 3230506ae3ff..a2c16b501087 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2450,3 +2450,4 @@ static void __exit ip_vs_cleanup(void) module_init(ip_vs_init); module_exit(ip_vs_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Virtual Server"); diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 5e6ec32aff2b..75f4c231f4a0 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -270,3 +270,4 @@ static void __exit ip_vs_dh_cleanup(void) module_init(ip_vs_dh_init); module_exit(ip_vs_dh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs destination hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index b846cc385279..ab117e5bc34e 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -72,3 +72,4 @@ static void __exit ip_vs_fo_cleanup(void) module_init(ip_vs_fo_init); module_exit(ip_vs_fo_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted failover scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index ef1f45e43b63..f53899d12416 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -635,3 +635,4 @@ static void __exit ip_vs_ftp_exit(void) module_init(ip_vs_ftp_init); module_exit(ip_vs_ftp_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs ftp helper"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index cf78ba4ce5ff..8ceec7a2fa8f 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -632,3 +632,4 @@ static void __exit ip_vs_lblc_cleanup(void) module_init(ip_vs_lblc_init); module_exit(ip_vs_lblc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 9eddf118b40e..0fb64707213f 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -817,3 +817,4 @@ static void __exit ip_vs_lblcr_cleanup(void) module_init(ip_vs_lblcr_init); module_exit(ip_vs_lblcr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 9d34d81fc6f1..c2764505e380 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -86,3 +86,4 @@ static void __exit ip_vs_lc_cleanup(void) module_init(ip_vs_lc_init); module_exit(ip_vs_lc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index f56862a87518..ed7f5c889b41 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -136,3 +136,4 @@ static void __exit ip_vs_nq_cleanup(void) module_init(ip_vs_nq_init); module_exit(ip_vs_nq_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs never queue scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index c03066fdd5ca..c7708b809700 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -79,3 +79,4 @@ static void __exit ip_vs_ovf_cleanup(void) module_init(ip_vs_ovf_init); module_exit(ip_vs_ovf_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs overflow connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0ac6705a61d3..e4ce1d9a63f9 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -185,3 +185,4 @@ static void __exit ip_vs_sip_cleanup(void) module_init(ip_vs_sip_init); module_exit(ip_vs_sip_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs sip helper"); diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 38495c6f6c7c..6baa34dff9f0 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -122,4 +122,5 @@ static void __exit ip_vs_rr_cleanup(void) module_init(ip_vs_rr_init); module_exit(ip_vs_rr_cleanup); +MODULE_DESCRIPTION("ipvs round-robin scheduler"); MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 7663288e5358..a46f99a56618 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -137,3 +137,4 @@ static void __exit ip_vs_sed_cleanup(void) module_init(ip_vs_sed_init); module_exit(ip_vs_sed_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs shortest expected delay scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index c2028e412092..92e77d7a6b50 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -376,3 +376,4 @@ static void __exit ip_vs_sh_cleanup(void) module_init(ip_vs_sh_init); module_exit(ip_vs_sh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs source hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index 3308e4cc740a..8d5419edde50 100644 --- a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -137,3 +137,4 @@ static void __exit ip_vs_twos_cleanup(void) module_init(ip_vs_twos_init); module_exit(ip_vs_twos_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs power of twos choice scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 09f584b564a0..9fa500927c0a 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -109,3 +109,4 @@ static void __exit ip_vs_wlc_cleanup(void) module_init(ip_vs_wlc_init); module_exit(ip_vs_wlc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 1bc7a0789d85..85ce0d04afac 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -263,3 +263,4 @@ static void __exit ip_vs_wrr_cleanup(void) module_init(ip_vs_wrr_init); module_exit(ip_vs_wrr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted round-robin scheduler"); -- cgit From 7b308feb4fd2d1c06919445c65c8fbf8e9fd1781 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 5 Nov 2023 11:56:00 -0800 Subject: netfilter: xt_recent: fix (increase) ipv6 literal buffer length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit in6_pton() supports 'low-32-bit dot-decimal representation' (this is useful with DNS64/NAT64 networks for example): # echo +aaaa:bbbb:cccc:dddd:eeee:ffff:1.2.3.4 > /proc/self/net/xt_recent/DEFAULT # cat /proc/self/net/xt_recent/DEFAULT src=aaaa:bbbb:cccc:dddd:eeee:ffff:0102:0304 ttl: 0 last_seen: 9733848829 oldest_pkt: 1 9733848829 but the provided buffer is too short: # echo +aaaa:bbbb:cccc:dddd:eeee:ffff:255.255.255.255 > /proc/self/net/xt_recent/DEFAULT -bash: echo: write error: Invalid argument Fixes: 079aa88fe717 ("netfilter: xt_recent: IPv6 support") Signed-off-by: Maciej Żenczykowski Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_recent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 7ddb9a78e3fc..ef93e0d3bee0 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -561,7 +561,7 @@ recent_mt_proc_write(struct file *file, const char __user *input, { struct recent_table *t = pde_data(file_inode(file)); struct recent_entry *e; - char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")]; const char *c = buf; union nf_inet_addr addr = {}; u_int16_t family; -- cgit From 80abbe8a8263106fe45a4f293b92b5c74cc9cc8a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Nov 2023 13:18:53 +0100 Subject: netfilter: nat: fix ipv6 nat redirect with mapped and scoped addresses The ipv6 redirect target was derived from the ipv4 one, i.e. its identical to a 'dnat' with the first (primary) address assigned to the network interface. The code has been moved around to make it usable from nf_tables too, but its still the same as it was back when this was added in 2012. IPv6, however, has different types of addresses, if the 'wrong' address comes first the redirection does not work. In Daniels case, the addresses are: inet6 ::ffff:192 ... inet6 2a01: ... ... so the function attempts to redirect to the mapped address. Add more checks before the address is deemed correct: 1. If the packets' daddr is scoped, search for a scoped address too 2. skip tentative addresses 3. skip mapped addresses Use the first address that appears to match our needs. Reported-by: Daniel Huhardeaux Closes: https://lore.kernel.org/netfilter/71be06b8-6aa0-4cf9-9e0b-e2839b01b22f@tootai.net/ Fixes: 115e23ac78f8 ("netfilter: ip6tables: add REDIRECT target") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_redirect.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 6616ba5d0b04..5b37487d9d11 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -80,6 +80,26 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; +static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope) +{ + unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr); + + if (ifa_addr_type & IPV6_ADDR_MAPPED) + return false; + + if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC))) + return false; + + if (scope) { + unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK; + + if (!(scope & ifa_scope)) + return false; + } + + return true; +} + unsigned int nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) @@ -89,14 +109,19 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, if (hooknum == NF_INET_LOCAL_OUT) { newdst.in6 = loopback_addr; } else { + unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; bool addr = false; idev = __in6_dev_get(skb->dev); if (idev != NULL) { + const struct inet6_ifaddr *ifa; + read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (!nf_nat_redirect_ipv6_usable(ifa, scope)) + continue; + newdst.in6 = ifa->addr; addr = true; break; -- cgit From 9bc64bd0cd765f696fcd40fc98909b1f7c73b2ba Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 3 Nov 2023 16:14:10 +0100 Subject: net/sched: act_ct: Always fill offloading tuple iifidx Referenced commit doesn't always set iifidx when offloading the flow to hardware. Fix the following cases: - nf_conn_act_ct_ext_fill() is called before extension is created with nf_conn_act_ct_ext_add() in tcf_ct_act(). This can cause rule offload with unspecified iifidx when connection is offloaded after only single original-direction packet has been processed by tc data path. Always fill the new nf_conn_act_ct_ext instance after creating it in nf_conn_act_ct_ext_add(). - Offloading of unidirectional UDP NEW connections is now supported, but ct flow iifidx field is not updated when connection is promoted to bidirectional which can result reply-direction iifidx to be zero when refreshing the connection. Fill in the extension and update flow iifidx before calling flow_offload_refresh(). Fixes: 9795ded7f924 ("net/sched: act_ct: Fill offloading tuple iifidx") Reviewed-by: Paul Blakey Signed-off-by: Vlad Buslov Reviewed-by: Simon Horman Fixes: 6a9bad0069cf ("net/sched: act_ct: offload UDP NEW connections") Link: https://lore.kernel.org/r/20231103151410.764271-1-vladbu@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/netfilter/nf_conntrack_act_ct.h | 30 ++++++++++++++++------------- net/openvswitch/conntrack.c | 2 +- net/sched/act_ct.c | 15 ++++++++++++++- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_act_ct.h b/include/net/netfilter/nf_conntrack_act_ct.h index 078d3c52c03f..e5f2f0b73a9a 100644 --- a/include/net/netfilter/nf_conntrack_act_ct.h +++ b/include/net/netfilter/nf_conntrack_act_ct.h @@ -20,7 +20,22 @@ static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_find(const struct nf #endif } -static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct nf_conn *ct) +static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ +#if IS_ENABLED(CONFIG_NET_ACT_CT) + struct nf_conn_act_ct_ext *act_ct_ext; + + act_ct_ext = nf_conn_act_ct_ext_find(ct); + if (dev_net(skb->dev) == &init_net && act_ct_ext) + act_ct_ext->ifindex[CTINFO2DIR(ctinfo)] = skb->dev->ifindex; +#endif +} + +static inline struct +nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { #if IS_ENABLED(CONFIG_NET_ACT_CT) struct nf_conn_act_ct_ext *act_ct = nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT); @@ -29,22 +44,11 @@ static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct nf_conn * return act_ct; act_ct = nf_ct_ext_add(ct, NF_CT_EXT_ACT_CT, GFP_ATOMIC); + nf_conn_act_ct_ext_fill(skb, ct, ctinfo); return act_ct; #else return NULL; #endif } -static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ -#if IS_ENABLED(CONFIG_NET_ACT_CT) - struct nf_conn_act_ct_ext *act_ct_ext; - - act_ct_ext = nf_conn_act_ct_ext_find(ct); - if (dev_net(skb->dev) == &init_net && act_ct_ext) - act_ct_ext->ifindex[CTINFO2DIR(ctinfo)] = skb->dev->ifindex; -#endif -} - #endif /* _NF_CONNTRACK_ACT_CT_H */ diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 0b9a785dea45..3019a4406ca4 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -985,7 +985,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, if (err) return err; - nf_conn_act_ct_ext_add(ct); + nf_conn_act_ct_ext_add(skb, ct, ctinfo); } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && labels_nonzero(&info->labels.mask)) { err = ovs_ct_set_labels(ct, key, &info->labels.value, diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 9583645e86c2..0db0ecf1d110 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -376,6 +376,17 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir]; } +static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry) +{ + struct nf_conn_act_ct_ext *act_ct_ext; + + act_ct_ext = nf_conn_act_ct_ext_find(entry->ct); + if (act_ct_ext) { + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL); + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY); + } +} + static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, bool tcp, bool bidirectional) @@ -671,6 +682,8 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, else ctinfo = IP_CT_ESTABLISHED_REPLY; + nf_conn_act_ct_ext_fill(skb, ct, ctinfo); + tcf_ct_flow_ct_ext_ifidx_update(flow); flow_offload_refresh(nf_ft, flow, force_refresh); if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { /* Process this flow in SW to allow promoting to ASSURED */ @@ -1034,7 +1047,7 @@ do_nat: tcf_ct_act_set_labels(ct, p->labels, p->labels_mask); if (!nf_ct_is_confirmed(ct)) - nf_conn_act_ct_ext_add(ct); + nf_conn_act_ct_ext_add(skb, ct, ctinfo); /* This will take care of sending queued events * even if the connection is already confirmed. -- cgit From 31356547e3316829f15a98ecf9a2096cf3e228d2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 7 Nov 2023 18:03:05 -0800 Subject: net: kcm: fill in MODULE_DESCRIPTION() W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Link: https://lore.kernel.org/r/20231108020305.537293-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/kcm/kcmsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index dd1d8ffd5f59..65d1f6755f98 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1946,4 +1946,5 @@ module_init(kcm_init); module_exit(kcm_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets"); MODULE_ALIAS_NETPROTO(PF_KCM); -- cgit From f1a3b283f852c613fae004f87bbbacc8cef5a061 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2023 16:04:40 +0000 Subject: net_sched: sch_fq: better validate TCA_FQ_WEIGHTS and TCA_FQ_PRIOMAP syzbot was able to trigger the following report while providing too small TCA_FQ_WEIGHTS attribute [1] Fix is to use NLA_POLICY_EXACT_LEN() to ensure user space provided correct sizes. Apply the same fix to TCA_FQ_PRIOMAP. [1] BUG: KMSAN: uninit-value in fq_load_weights net/sched/sch_fq.c:960 [inline] BUG: KMSAN: uninit-value in fq_change+0x1348/0x2fe0 net/sched/sch_fq.c:1071 fq_load_weights net/sched/sch_fq.c:960 [inline] fq_change+0x1348/0x2fe0 net/sched/sch_fq.c:1071 fq_init+0x68e/0x780 net/sched/sch_fq.c:1159 qdisc_create+0x12f3/0x1be0 net/sched/sch_api.c:1326 tc_modify_qdisc+0x11ef/0x2c20 rtnetlink_rcv_msg+0x16a6/0x1840 net/core/rtnetlink.c:6558 netlink_rcv_skb+0x371/0x650 net/netlink/af_netlink.c:2545 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6576 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0xf47/0x1250 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x1238/0x13d0 net/netlink/af_netlink.c:1910 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2588 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2642 __sys_sendmsg net/socket.c:2671 [inline] __do_sys_sendmsg net/socket.c:2680 [inline] __se_sys_sendmsg net/socket.c:2678 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2678 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1214 [inline] netlink_sendmsg+0xb34/0x13d0 net/netlink/af_netlink.c:1885 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2588 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2642 __sys_sendmsg net/socket.c:2671 [inline] __do_sys_sendmsg net/socket.c:2680 [inline] __se_sys_sendmsg net/socket.c:2678 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2678 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 1 PID: 5001 Comm: syz-executor300 Not tainted 6.6.0-syzkaller-12401-g8f6f76a6a29f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Fixes: 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") Fixes: 49e7265fd098 ("net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20231107160440.1992526-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 0fd18c344ab5..3a31c47fea9b 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -919,14 +919,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, [TCA_FQ_HORIZON] = { .type = NLA_U32 }, [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, - [TCA_FQ_PRIOMAP] = { - .type = NLA_BINARY, - .len = sizeof(struct tc_prio_qopt), - }, - [TCA_FQ_WEIGHTS] = { - .type = NLA_BINARY, - .len = FQ_BANDS * sizeof(s32), - }, + [TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)), + [TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)), }; /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ -- cgit From b714ca2ccf6a90733f6ceb14abb6ce914f8832c3 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 7 Nov 2023 16:00:40 +0800 Subject: ptp: ptp_read should not release queue Firstly, queue is not the memory allocated in ptp_read; Secondly, other processes may block at ptp_read and wait for conditions to be met to perform read operations. Acked-by: Richard Cochran Reported-and-tested-by: syzbot+df3f3ef31f60781fa911@syzkaller.appspotmail.com Fixes: 8f5de6fb2453 ("ptp: support multiple timestamp event readers") Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_18747D76F1675A3C633772960237544AAA09@qq.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 282cd7d24077..27c1ef493617 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -585,7 +585,5 @@ ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags, free_event: kfree(event); exit: - if (result < 0) - ptp_release(pccontext); return result; } -- cgit From 1bea2c3e6df8caf45d18384abfb707f47e9ff993 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 7 Nov 2023 16:00:41 +0800 Subject: ptp: fix corrupted list in ptp_open There is no lock protection when writing ptp->tsevqs in ptp_open() and ptp_release(), which can cause data corruption, use spin lock to avoid this issue. Moreover, ptp_release() should not be used to release the queue in ptp_read(), and it should be deleted altogether. Acked-by: Richard Cochran Reported-and-tested-by: syzbot+df3f3ef31f60781fa911@syzkaller.appspotmail.com Fixes: 8f5de6fb2453 ("ptp: support multiple timestamp event readers") Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_CD19564FFE8DA8A5918DFE92325D92DD8107@qq.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 21 ++++++++++++--------- drivers/ptp/ptp_clock.c | 8 ++++++-- drivers/ptp/ptp_private.h | 1 + 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 27c1ef493617..3f7a74788802 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -108,6 +108,7 @@ int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode) container_of(pccontext->clk, struct ptp_clock, clock); struct timestamp_event_queue *queue; char debugfsname[32]; + unsigned long flags; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) @@ -119,7 +120,9 @@ int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode) } bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS); spin_lock_init(&queue->lock); + spin_lock_irqsave(&ptp->tsevqs_lock, flags); list_add_tail(&queue->qlist, &ptp->tsevqs); + spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); pccontext->private_clkdata = queue; /* Debugfs contents */ @@ -139,16 +142,16 @@ int ptp_release(struct posix_clock_context *pccontext) { struct timestamp_event_queue *queue = pccontext->private_clkdata; unsigned long flags; + struct ptp_clock *ptp = + container_of(pccontext->clk, struct ptp_clock, clock); - if (queue) { - debugfs_remove(queue->debugfs_instance); - pccontext->private_clkdata = NULL; - spin_lock_irqsave(&queue->lock, flags); - list_del(&queue->qlist); - spin_unlock_irqrestore(&queue->lock, flags); - bitmap_free(queue->mask); - kfree(queue); - } + debugfs_remove(queue->debugfs_instance); + pccontext->private_clkdata = NULL; + spin_lock_irqsave(&ptp->tsevqs_lock, flags); + list_del(&queue->qlist); + spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); + bitmap_free(queue->mask); + kfree(queue); return 0; } diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 3d1b0a97301c..3134568af622 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -179,11 +179,11 @@ static void ptp_clock_release(struct device *dev) mutex_destroy(&ptp->pincfg_mux); mutex_destroy(&ptp->n_vclocks_mux); /* Delete first entry */ + spin_lock_irqsave(&ptp->tsevqs_lock, flags); tsevq = list_first_entry(&ptp->tsevqs, struct timestamp_event_queue, qlist); - spin_lock_irqsave(&tsevq->lock, flags); list_del(&tsevq->qlist); - spin_unlock_irqrestore(&tsevq->lock, flags); + spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); bitmap_free(tsevq->mask); kfree(tsevq); debugfs_remove(ptp->debugfs_root); @@ -247,6 +247,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (!queue) goto no_memory_queue; list_add_tail(&queue->qlist, &ptp->tsevqs); + spin_lock_init(&ptp->tsevqs_lock); queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL); if (!queue->mask) goto no_memory_bitmap; @@ -407,6 +408,7 @@ void ptp_clock_event(struct ptp_clock *ptp, struct ptp_clock_event *event) { struct timestamp_event_queue *tsevq; struct pps_event_time evt; + unsigned long flags; switch (event->type) { @@ -415,10 +417,12 @@ void ptp_clock_event(struct ptp_clock *ptp, struct ptp_clock_event *event) case PTP_CLOCK_EXTTS: /* Enqueue timestamp on selected queues */ + spin_lock_irqsave(&ptp->tsevqs_lock, flags); list_for_each_entry(tsevq, &ptp->tsevqs, qlist) { if (test_bit((unsigned int)event->index, tsevq->mask)) enqueue_external_timestamp(tsevq, event); } + spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); wake_up_interruptible(&ptp->tsev_wq); break; diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 52f87e394aa6..35fde0a05746 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -44,6 +44,7 @@ struct ptp_clock { struct pps_device *pps_source; long dialed_frequency; /* remembers the frequency adjustment */ struct list_head tsevqs; /* timestamp fifo list */ + spinlock_t tsevqs_lock; /* protects tsevqs from concurrent access */ struct mutex pincfg_mux; /* protect concurrent info->pin_config access */ wait_queue_head_t tsev_wq; int defunct; /* tells readers to go away when clock is being removed */ -- cgit From 83b9dda8afa4e968d9cce253f390b01c0612a2a5 Mon Sep 17 00:00:00 2001 From: Diogo Ivo Date: Tue, 7 Nov 2023 12:00:36 +0000 Subject: net: ti: icss-iep: fix setting counter value Currently icss_iep_set_counter() writes the upper 32-bits of the counter value to both the lower and upper counter registers, so fix this by writing the appropriate value to the lower register. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: Diogo Ivo Link: https://lore.kernel.org/r/20231107120037.1513546-1-diogo.ivo@siemens.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icss_iep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 4cf2a52e4378..3025e9c18970 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -177,7 +177,7 @@ static void icss_iep_set_counter(struct icss_iep *iep, u64 ns) if (iep->plat_data->flags & ICSS_IEP_64BIT_COUNTER_SUPPORT) writel(upper_32_bits(ns), iep->base + iep->plat_data->reg_offs[ICSS_IEP_COUNT_REG1]); - writel(upper_32_bits(ns), iep->base + iep->plat_data->reg_offs[ICSS_IEP_COUNT_REG0]); + writel(lower_32_bits(ns), iep->base + iep->plat_data->reg_offs[ICSS_IEP_COUNT_REG0]); } static void icss_iep_update_to_next_boundary(struct icss_iep *iep, u64 start_ns); -- cgit