From fc5a40694ba684fb3b7009819965ec38e829118f Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 23 Dec 2021 18:33:39 +0100 Subject: Revert "dt-bindings: arm: qcom: Document SDX65 platform and boards" This reverts commit 3b338c9a6a2afd6db46d5d8e39ae4f5eef420bf8. This was a duplicate of 61339f368d59d25e22401731f89de44e3215508b, causing the sdx65 compatible and its board to be documented twice. Signed-off-by: Stanislav Jakubek Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20211223173339.GA3925@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/arm/qcom.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml index 2d613282816a..c8808e0f9e64 100644 --- a/Documentation/devicetree/bindings/arm/qcom.yaml +++ b/Documentation/devicetree/bindings/arm/qcom.yaml @@ -48,7 +48,6 @@ description: | sdx65 sm7225 sm8150 - sdx65 sm8250 sm8350 @@ -225,11 +224,6 @@ properties: - qcom,sdx65-mtp - const: qcom,sdx65 - - items: - - enum: - - qcom,sdx65-mtp - - const: qcom,sdx65 - - items: - enum: - qcom,ipq6018-cp01 -- cgit From 2ddd96aadbd0412040ef49eda94549c32de6c92c Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 23 Jan 2022 14:36:15 +0100 Subject: arm64: dts: rockchip: fix dma-controller node names on rk356x DMA-Cotrollers defined in rk356x.dtsi do not match the pattern in bindings. arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dt.yaml: dmac@fe530000: $nodename:0: 'dmac@fe530000' does not match '^dma-controller(@.*)?$' From schema: Documentation/devicetree/bindings/dma/arm,pl330.yaml arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dt.yaml: dmac@fe550000: $nodename:0: 'dmac@fe550000' does not match '^dma-controller(@.*)?$' From schema: Documentation/devicetree/bindings/dma/arm,pl330.yaml This Patch fixes it. Signed-off-by: Frank Wunderlich Link: https://lore.kernel.org/r/20220123133615.135789-1-linux@fw-web.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk356x.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index a68033a23975..8ccce54ee8e7 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -651,7 +651,7 @@ status = "disabled"; }; - dmac0: dmac@fe530000 { + dmac0: dma-controller@fe530000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xfe530000 0x0 0x4000>; interrupts = , @@ -662,7 +662,7 @@ #dma-cells = <1>; }; - dmac1: dmac@fe550000 { + dmac1: dma-controller@fe550000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xfe550000 0x0 0x4000>; interrupts = , -- cgit From 85a8bccfa945680dc561f06b65ea01341d2033fc Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 23 Jan 2022 14:35:10 +0100 Subject: arm64: dts: rockchip: drop pclk_xpcs from gmac0 on rk3568 pclk_xpcs is not supported by mainline driver and breaks dtbs_check following warnings occour, and many more rk3568-evb1-v10.dt.yaml: ethernet@fe2a0000: clocks: [[15, 386], [15, 389], [15, 389], [15, 184], [15, 180], [15, 181], [15, 389], [15, 185], [15, 172]] is too long From schema: Documentation/devicetree/bindings/net/snps,dwmac.yaml rk3568-evb1-v10.dt.yaml: ethernet@fe2a0000: clock-names: ['stmmaceth', 'mac_clk_rx', 'mac_clk_tx', 'clk_mac_refout', 'aclk_mac', 'pclk_mac', 'clk_mac_speed', 'ptp_ref', 'pclk_xpcs'] is too long From schema: Documentation/devicetree/bindings/net/snps,dwmac.yaml after removing it, the clock and other warnings are gone. pclk_xpcs on gmac is used to support QSGMII, but this requires a driver supporting it. Once xpcs support is introduced, the clock can be added to the documentation and both controllers. Fixes: b8d41e5053cd ("arm64: dts: rockchip: add gmac0 node to rk3568") Co-developed-by: Peter Geis Signed-off-by: Peter Geis Signed-off-by: Frank Wunderlich Acked-by: Michael Riesch Link: https://lore.kernel.org/r/20220123133510.135651-1-linux@fw-web.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3568.dtsi | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3568.dtsi b/arch/arm64/boot/dts/rockchip/rk3568.dtsi index 2fd313a295f8..d91df1cde736 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3568.dtsi @@ -32,13 +32,11 @@ clocks = <&cru SCLK_GMAC0>, <&cru SCLK_GMAC0_RX_TX>, <&cru SCLK_GMAC0_RX_TX>, <&cru CLK_MAC0_REFOUT>, <&cru ACLK_GMAC0>, <&cru PCLK_GMAC0>, - <&cru SCLK_GMAC0_RX_TX>, <&cru CLK_GMAC0_PTP_REF>, - <&cru PCLK_XPCS>; + <&cru SCLK_GMAC0_RX_TX>, <&cru CLK_GMAC0_PTP_REF>; clock-names = "stmmaceth", "mac_clk_rx", "mac_clk_tx", "clk_mac_refout", "aclk_mac", "pclk_mac", - "clk_mac_speed", "ptp_ref", - "pclk_xpcs"; + "clk_mac_speed", "ptp_ref"; resets = <&cru SRST_A_GMAC0>; reset-names = "stmmaceth"; rockchip,grf = <&grf>; -- cgit From ed2c66a95c0c5669880aa93d0d34c6e9694b4cbd Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Thu, 20 Jan 2022 13:51:56 +0100 Subject: arm64: dts: rockchip: fix rk3399-puma-haikou USB OTG mode The micro USB3.0 port available on the Haikou evaluation kit for Puma RK3399-Q7 SoM supports dual-role model (aka drd or OTG) but its support was broken until now because of missing logic around the ID pin. This adds proper support for USB OTG on Puma Haikou by "connecting" the GPIO used for USB ID to the USB3 controller device. Cc: Quentin Schulz Signed-off-by: Quentin Schulz Link: https://lore.kernel.org/r/20220120125156.16217-1-quentin.schulz@theobroma-systems.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts | 1 + arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts index 292bb7e80cf3..3ae5d727e367 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts @@ -232,6 +232,7 @@ &usbdrd_dwc3_0 { dr_mode = "otg"; + extcon = <&extcon_usb3>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index fb67db4619ea..002ece51c3ba 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -25,6 +25,13 @@ }; }; + extcon_usb3: extcon-usb3 { + compatible = "linux,extcon-usb-gpio"; + id-gpio = <&gpio1 RK_PC2 GPIO_ACTIVE_HIGH>; + pinctrl-names = "default"; + pinctrl-0 = <&usb3_id>; + }; + clkin_gmac: external-gmac-clock { compatible = "fixed-clock"; clock-frequency = <125000000>; @@ -422,6 +429,13 @@ <4 RK_PA3 RK_FUNC_GPIO &pcfg_pull_none>; }; }; + + usb3 { + usb3_id: usb3-id { + rockchip,pins = + <1 RK_PC2 RK_FUNC_GPIO &pcfg_pull_none>; + }; + }; }; &sdhci { -- cgit From b5fbaf7d779f5f02b7f75b080e7707222573be2a Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 14 Jan 2022 15:02:07 -0800 Subject: arm64: dts: rockchip: Switch RK3399-Gru DP to SPDIF output Commit b18c6c3c7768 ("ASoC: rockchip: cdn-dp sound output use spdif") switched the platform to SPDIF, but we didn't fix up the device tree. Drop the pinctrl settings, because the 'spdif_bus' pins are either: * unused (on kevin, bob), so the settings is ~harmless * used by a different function (on scarlet), which causes probe failures (!!) Fixes: b18c6c3c7768 ("ASoC: rockchip: cdn-dp sound output use spdif") Signed-off-by: Brian Norris Reviewed-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20220114150129.v2.1.I46f64b00508d9dff34abe1c3e8d2defdab4ea1e5@changeid Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi index 45a5ae5d2027..162f08bca0d4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi @@ -286,7 +286,7 @@ sound: sound { compatible = "rockchip,rk3399-gru-sound"; - rockchip,cpu = <&i2s0 &i2s2>; + rockchip,cpu = <&i2s0 &spdif>; }; }; @@ -437,10 +437,6 @@ ap_i2c_audio: &i2c8 { status = "okay"; }; -&i2s2 { - status = "okay"; -}; - &io_domains { status = "okay"; @@ -537,6 +533,17 @@ ap_i2c_audio: &i2c8 { vqmmc-supply = <&ppvar_sd_card_io>; }; +&spdif { + status = "okay"; + + /* + * SPDIF is routed internally to DP; we either don't use these pins, or + * mux them to something else. + */ + /delete-property/ pinctrl-0; + /delete-property/ pinctrl-names; +}; + &spi1 { status = "okay"; -- cgit From 305325688ff924c52a6e12f92235a89536e022cf Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 20 Jan 2022 17:02:47 -0600 Subject: NTB/msi: Use struct_size() helper in devm_kzalloc() Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. Also, address the following sparse warnings: drivers/ntb/msi.c:46:23: warning: using sizeof on a flexible structure Link: https://github.com/KSPP/linux/issues/174 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- drivers/ntb/msi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c index dd683cb58d09..6295e55ef85e 100644 --- a/drivers/ntb/msi.c +++ b/drivers/ntb/msi.c @@ -33,7 +33,6 @@ int ntb_msi_init(struct ntb_dev *ntb, { phys_addr_t mw_phys_addr; resource_size_t mw_size; - size_t struct_size; int peer_widx; int peers; int ret; @@ -43,9 +42,8 @@ int ntb_msi_init(struct ntb_dev *ntb, if (peers <= 0) return -EINVAL; - struct_size = sizeof(*ntb->msi) + sizeof(*ntb->msi->peer_mws) * peers; - - ntb->msi = devm_kzalloc(&ntb->dev, struct_size, GFP_KERNEL); + ntb->msi = devm_kzalloc(&ntb->dev, struct_size(ntb->msi, peer_mws, peers), + GFP_KERNEL); if (!ntb->msi) return -ENOMEM; -- cgit From 6596a0229541270fb8d38d989f91b78838e5e9da Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 19 Jan 2022 10:22:53 +0100 Subject: xfrm: fix MTU regression Commit 749439bfac6e1a2932c582e2699f91d329658196 ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") breaks PMTU for xfrm. A Packet Too Big ICMPv6 message received in response to an ESP packet will prevent all further communication through the tunnel if the reported MTU minus the ESP overhead is smaller than 1280. E.g. in a case of a tunnel-mode ESP with sha256/aes the overhead is 92 bytes. Receiving a PTB with MTU of 1371 or less will result in all further packets in the tunnel dropped. A ping through the tunnel fails with "ping: sendmsg: Invalid argument". Apparently the MTU on the xfrm route is smaller than 1280 and fails the check inside ip6_setup_cork() added by 749439bf. We found this by debugging USGv6/ipv6ready failures. Failing tests are: "Phase-2 Interoperability Test Scenario IPsec" / 5.3.11 and 5.4.11 (Tunnel Mode: Fragmentation). Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") attempted to fix this but caused another regression in TCP MSS calculations and had to be reverted. The patch below fixes the situation by dropping the MTU check and instead checking for the underflows described in the 749439bf commit message. Signed-off-by: Jiri Bohac Fixes: 749439bfac6e ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") Signed-off-by: Steffen Klassert --- net/ipv6/ip6_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2995f8d89e7e..4ff110214dea 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1408,8 +1408,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (np->frag_size) mtu = np->frag_size; } - if (mtu < IPV6_MIN_MTU) - return -EINVAL; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; @@ -1471,8 +1469,6 @@ static int __ip6_append_data(struct sock *sk, fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + @@ -1480,6 +1476,13 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + if (mtu < fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr)) + goto emsgsize; + + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ -- cgit From c1aca3080e382886e2e58e809787441984a2f89b Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:13 -0800 Subject: xfrm: Check if_id in xfrm_migrate This patch enables distinguishing SAs and SPs based on if_id during the xfrm_migrate flow. This ensures support for xfrm interfaces throughout the SA/SP lifecycle. When there are multiple existing SPs with the same direction, the same xfrm_selector and different endpoint addresses, xfrm_migrate might fail with ENODATA. Specifically, the code path for performing xfrm_migrate is: Stage 1: find policy to migrate with xfrm_migrate_policy_find(sel, dir, type, net) Stage 2: find and update state(s) with xfrm_migrate_state_find(mp, net) Stage 3: update endpoint address(es) of template(s) with xfrm_policy_migrate(pol, m, num_migrate) Currently "Stage 1" always returns the first xfrm_policy that matches, and "Stage 3" looks for the xfrm_tmpl that matches the old endpoint address. Thus if there are multiple xfrm_policy with same selector, direction, type and net, "Stage 1" might rertun a wrong xfrm_policy and "Stage 3" will fail with ENODATA because it cannot find a xfrm_tmpl with the matching endpoint address. The fix is to allow userspace to pass an if_id and add if_id to the matching rule in Stage 1 and Stage 2 since if_id is a unique ID for xfrm_policy and xfrm_state. For compatibility, if_id will only be checked if the attribute is set. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1668886 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- net/key/af_key.c | 2 +- net/xfrm/xfrm_policy.c | 14 ++++++++------ net/xfrm/xfrm_state.c | 7 ++++++- net/xfrm/xfrm_user.c | 6 +++++- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fdb41e8bb626..743dd1da506e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,14 +1681,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/net/key/af_key.c b/net/key/af_key.c index de24a7d474df..9bf52a09b5ff 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2623,7 +2623,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL); + kma ? &k : NULL, net, NULL, 0); out: return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 04d1ce9b510f..882526159d3a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4256,7 +4256,7 @@ static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, - u8 dir, u8 type, struct net *net) + u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; @@ -4265,7 +4265,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; @@ -4277,7 +4278,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * if ((pol->priority >= priority) && ret) break; - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; @@ -4393,7 +4395,7 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, u32 if_id) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4412,14 +4414,14 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 1 - find policy */ - if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { + if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { - if ((x = xfrm_migrate_state_find(mp, net))) { + if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ca6bee18346d..b0eeb0aef493 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1606,7 +1606,8 @@ out: return NULL; } -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net) +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; @@ -1622,6 +1623,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n continue; if (m->reqid && x->props.reqid != m->reqid) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, @@ -1637,6 +1640,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n if (x->props.mode != m->mode || x->id.proto != m->proto) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8cd6c8129004..a4fb596e87af 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2608,6 +2608,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + u32 if_id = 0; if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2632,7 +2633,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOMEM; } - err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap); + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + + err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id); kfree(encap); -- cgit From e03c3bba351f99ad932e8f06baa9da1afc418e02 Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:14 -0800 Subject: xfrm: Fix xfrm migrate issues when address family changes xfrm_migrate cannot handle address family change of an xfrm_state. The symptons are the xfrm_state will be migrated to a wrong address, and sending as well as receiving packets wil be broken. This commit fixes it by breaking the original xfrm_state_clone method into two steps so as to update the props.family before running xfrm_init_state. As the result, xfrm_state's inner mode, outer mode, type and IP header length in xfrm_state_migrate can be updated with the new address family. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1885354 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b0eeb0aef493..1ba6fbfe8cdb 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1579,9 +1579,6 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); - if (xfrm_init_state(x) < 0) - goto error; - x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; @@ -1668,6 +1665,11 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, if (!xc) return NULL; + xc->props.family = m->new_family; + + if (xfrm_init_state(xc) < 0) + goto error; + memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); -- cgit From 31eeb6b09f4053f32a30ce9fbcdfca31f713028d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Jan 2022 17:57:01 +0000 Subject: arm64: dts: juno: Remove GICv2m dma-range Although it is painstakingly honest to describe all 3 PCI windows in "dma-ranges", it misses the the subtle distinction that the window for the GICv2m range is normally programmed for Device memory attributes rather than Normal Cacheable like the DRAM windows. Since MMU-401 only offers stage 2 translation, this means that when the PCI SMMU is enabled, accesses through that IPA range unexpectedly lose coherency if mapped as cacheable at the SMMU, due to the attribute combining rules. Since an extra 256KB is neither here nor there when we still have 10GB worth of usable address space, rather than attempting to describe and cope with this detail let's just remove the offending range. If the SMMU is not used then it makes no difference anyway. Link: https://lore.kernel.org/r/856c3f7192c6c3ce545ba67462f2ce9c86ed6b0c.1643046936.git.robin.murphy@arm.com Fixes: 4ac4d146cb63 ("arm64: dts: juno: Describe PCI dma-ranges") Reported-by: Anders Roxell Acked-by: Liviu Dudau Signed-off-by: Robin Murphy Signed-off-by: Sudeep Holla --- arch/arm64/boot/dts/arm/juno-base.dtsi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/arm/juno-base.dtsi b/arch/arm64/boot/dts/arm/juno-base.dtsi index 6288e104a089..a2635b14da30 100644 --- a/arch/arm64/boot/dts/arm/juno-base.dtsi +++ b/arch/arm64/boot/dts/arm/juno-base.dtsi @@ -543,8 +543,7 @@ <0x02000000 0x00 0x50000000 0x00 0x50000000 0x0 0x08000000>, <0x42000000 0x40 0x00000000 0x40 0x00000000 0x1 0x00000000>; /* Standard AXI Translation entries as programmed by EDK2 */ - dma-ranges = <0x02000000 0x0 0x2c1c0000 0x0 0x2c1c0000 0x0 0x00040000>, - <0x02000000 0x0 0x80000000 0x0 0x80000000 0x0 0x80000000>, + dma-ranges = <0x02000000 0x0 0x80000000 0x0 0x80000000 0x0 0x80000000>, <0x43000000 0x8 0x00000000 0x8 0x00000000 0x2 0x00000000>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; -- cgit From a6d95c5a628a09be129f25d5663a7e9db8261f51 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 26 Jan 2022 16:00:18 +0100 Subject: Revert "xfrm: xfrm_state_mtu should return at least 1280 for ipv6" This reverts commit b515d2637276a3810d6595e10ab02c13bfd0b63a. Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") in v5.14 breaks the TCP MSS calculation in ipsec transport mode, resulting complete stalls of TCP connections. This happens when the (P)MTU is 1280 or slighly larger. The desired formula for the MSS is: MSS = (MTU - ESP_overhead) - IP header - TCP header However, the above commit clamps the (MTU - ESP_overhead) to a minimum of 1280, turning the formula into MSS = max(MTU - ESP overhead, 1280) - IP header - TCP header With the (P)MTU near 1280, the calculated MSS is too large and the resulting TCP packets never make it to the destination because they are over the actual PMTU. The above commit also causes suboptimal double fragmentation in xfrm tunnel mode, as described in https://lore.kernel.org/netdev/20210429202529.codhwpc7w6kbudug@dwarf.suse.cz/ The original problem the above commit was trying to fix is now fixed by commit 6596a0229541270fb8d38d989f91b78838e5e9da ("xfrm: fix MTU regression"). Signed-off-by: Jiri Bohac Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/xfrm/xfrm_state.c | 14 ++------------ 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 743dd1da506e..76aa6f11a540 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 851f542928a3..e1b1d080e908 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -671,7 +671,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bb2c407b46b..7591160edce1 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -707,7 +707,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1ba6fbfe8cdb..b749935152ba 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2579,7 +2579,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2610,17 +2610,7 @@ u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(__xfrm_state_mtu); - -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) -{ - mtu = __xfrm_state_mtu(x, mtu); - - if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) - return IPV6_MIN_MTU; - - return mtu; -} +EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { -- cgit From ebea268ea583ba4970df425dfef8c8e21d0a4e12 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Thu, 13 Jan 2022 14:31:52 +0000 Subject: arm64: tegra: Disable ISO SMMU for Tegra194 Commit e762232f9466 ("arm64: tegra: Add ISO SMMU controller for Tegra194") added the ISO SMMU for display devices on Tegra194. The SMMU is enabled by default but not hooked up to the display controllers yet because we do not have a way to pass frame-buffer memory from the bootloader to the kernel. However, even though the SMMU is not hooked up to the display controllers' SMMU faults are being seen if a display is connected. Therefore, keep the ISO SMMU disabled by default for now. Fixes: e762232f9466 ("arm64: tegra: Add ISO SMMU controller for Tegra194") Signed-off-by: Jon Hunter Signed-off-by: Thierry Reding --- arch/arm64/boot/dts/nvidia/tegra194.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi index 3c4acfca459d..6005e422c968 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi @@ -1585,7 +1585,7 @@ #iommu-cells = <1>; nvidia,memory-controller = <&mc>; - status = "okay"; + status = "disabled"; }; smmu: iommu@12000000 { -- cgit From d5081bf5dcfb1cb83fb538708b0ac07a10a79cc4 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 27 Jan 2022 13:31:12 -0700 Subject: ntb: intel: fix port config status offset for SPR The field offset for port configuration status on SPR has been changed to bit 14 from ICX where it resides at bit 12. By chance link status detection continued to work on SPR. This is due to bit 12 being a configuration bit which is in sync with the status bit. Fix this by checking for a SPR device and checking correct status bit. Fixes: 26bfe3d0b227 ("ntb: intel: Add Icelake (gen4) support for Intel NTB") Tested-by: Jerry Dai Signed-off-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/hw/intel/ntb_hw_gen4.c | 17 ++++++++++++++++- drivers/ntb/hw/intel/ntb_hw_gen4.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c index fede05151f69..4081fc538ff4 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -168,6 +168,18 @@ static enum ntb_topo gen4_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd) return NTB_TOPO_NONE; } +static enum ntb_topo spr_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd) +{ + switch (ppd & SPR_PPD_TOPO_MASK) { + case SPR_PPD_TOPO_B2B_USD: + return NTB_TOPO_B2B_USD; + case SPR_PPD_TOPO_B2B_DSD: + return NTB_TOPO_B2B_DSD; + } + + return NTB_TOPO_NONE; +} + int gen4_init_dev(struct intel_ntb_dev *ndev) { struct pci_dev *pdev = ndev->ntb.pdev; @@ -183,7 +195,10 @@ int gen4_init_dev(struct intel_ntb_dev *ndev) } ppd1 = ioread32(ndev->self_mmio + GEN4_PPD1_OFFSET); - ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); + if (pdev_is_ICX(pdev)) + ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); + else if (pdev_is_SPR(pdev)) + ndev->ntb.topo = spr_ppd_topo(ndev, ppd1); dev_dbg(&pdev->dev, "ppd %#x topo %s\n", ppd1, ntb_topo_string(ndev->ntb.topo)); if (ndev->ntb.topo == NTB_TOPO_NONE) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.h b/drivers/ntb/hw/intel/ntb_hw_gen4.h index 3fcd3fdce9ed..f91323eaf5ce 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.h @@ -49,10 +49,14 @@ #define GEN4_PPD_CLEAR_TRN 0x0001 #define GEN4_PPD_LINKTRN 0x0008 #define GEN4_PPD_CONN_MASK 0x0300 +#define SPR_PPD_CONN_MASK 0x0700 #define GEN4_PPD_CONN_B2B 0x0200 #define GEN4_PPD_DEV_MASK 0x1000 #define GEN4_PPD_DEV_DSD 0x1000 #define GEN4_PPD_DEV_USD 0x0000 +#define SPR_PPD_DEV_MASK 0x4000 +#define SPR_PPD_DEV_DSD 0x4000 +#define SPR_PPD_DEV_USD 0x0000 #define GEN4_LINK_CTRL_LINK_DISABLE 0x0010 #define GEN4_SLOTSTS 0xb05a @@ -62,6 +66,10 @@ #define GEN4_PPD_TOPO_B2B_USD (GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_USD) #define GEN4_PPD_TOPO_B2B_DSD (GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_DSD) +#define SPR_PPD_TOPO_MASK (SPR_PPD_CONN_MASK | SPR_PPD_DEV_MASK) +#define SPR_PPD_TOPO_B2B_USD (GEN4_PPD_CONN_B2B | SPR_PPD_DEV_USD) +#define SPR_PPD_TOPO_B2B_DSD (GEN4_PPD_CONN_B2B | SPR_PPD_DEV_DSD) + #define GEN4_DB_COUNT 32 #define GEN4_DB_LINK 32 #define GEN4_DB_LINK_BIT BIT_ULL(GEN4_DB_LINK) @@ -112,4 +120,12 @@ static inline int pdev_is_ICX(struct pci_dev *pdev) return 0; } +static inline int pdev_is_SPR(struct pci_dev *pdev) +{ + if (pdev_is_gen4(pdev) && + pdev->revision > PCI_DEVICE_REVISION_ICX_MAX) + return 1; + return 0; +} + #endif -- cgit From ad02776cf8d083e28b1ca4d93d8b1949668c27cc Mon Sep 17 00:00:00 2001 From: Peter Geis Date: Thu, 27 Jan 2022 19:38:05 -0500 Subject: arm64: dts: rockchip: fix Quartz64-A ddr regulator voltage The Quartz64 Model A uses a voltage divider to ensure ddr voltage is within specification from the default regulator configuration. Adjusting this voltage is detrimental, and currently causes the ddr voltage to be about 0.8v. Remove the min and max voltage setpoints for the ddr regulator. Fixes: b33a22a1e7c4 ("arm64: dts: rockchip: add basic dts for Pine64 Quartz64-A") Signed-off-by: Peter Geis Link: https://lore.kernel.org/r/20220128003809.3291407-2-pgwipeout@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts index 166399b7f13f..d9eb92d59099 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts @@ -285,8 +285,6 @@ vcc_ddr: DCDC_REG3 { regulator-always-on; regulator-boot-on; - regulator-min-microvolt = <1100000>; - regulator-max-microvolt = <1100000>; regulator-initial-mode = <0x2>; regulator-name = "vcc_ddr"; regulator-state-mem { -- cgit From 62966cbdda8a92f82d966a45aa671e788b2006f7 Mon Sep 17 00:00:00 2001 From: Jakob Unterwurzacher Date: Wed, 19 Jan 2022 14:49:48 +0100 Subject: arm64: dts: rockchip: fix rk3399-puma eMMC HS400 signal integrity There are signal integrity issues running the eMMC at 200MHz on Puma RK3399-Q7. Similar to the work-around found for RK3399 Gru boards, lowering the frequency to 100MHz made the eMMC much more stable, so let's lower the frequency to 100MHz. It might be possible to run at 150MHz as on RK3399 Gru boards but only 100MHz was extensively tested. Cc: Quentin Schulz Signed-off-by: Jakob Unterwurzacher Signed-off-by: Quentin Schulz Link: https://lore.kernel.org/r/20220119134948.1444965-1-quentin.schulz@theobroma-systems.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index 002ece51c3ba..08fa00364b42 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -439,6 +439,12 @@ }; &sdhci { + /* + * Signal integrity isn't great at 200MHz but 100MHz has proven stable + * enough. + */ + max-frequency = <100000000>; + bus-width = <8>; mmc-hs400-1_8v; mmc-hs400-enhanced-strobe; -- cgit From 8fd9415042826c7609c588e5ef45f3e84237785f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 29 Jan 2022 18:54:29 +0100 Subject: arm64: dts: rockchip: align pl330 node name with dtschema Fixes dtbs_check warnings like: dmac@ff240000: $nodename:0: 'dmac@ff240000' does not match '^dma-controller(@.*)?$' Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220129175429.298836-1-krzysztof.kozlowski@canonical.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30.dtsi | 2 +- arch/arm64/boot/dts/rockchip/rk3328.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/px30.dtsi b/arch/arm64/boot/dts/rockchip/px30.dtsi index f972704dfe7a..56dfbb2e2fa6 100644 --- a/arch/arm64/boot/dts/rockchip/px30.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30.dtsi @@ -711,7 +711,7 @@ clock-names = "pclk", "timer"; }; - dmac: dmac@ff240000 { + dmac: dma-controller@ff240000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xff240000 0x0 0x4000>; interrupts = , diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 39db0b85b4da..b822533dc7f1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -489,7 +489,7 @@ status = "disabled"; }; - dmac: dmac@ff1f0000 { + dmac: dma-controller@ff1f0000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xff1f0000 0x0 0x4000>; interrupts = , -- cgit From 0fd4dcb607ce29110d6c0b481a98c4ff3d300551 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Dec 2021 08:20:58 -0800 Subject: arm64: dts: qcom: sm8350: Correct UFS symbol clocks The introduction of '9a61f813fcc8 ("clk: qcom: regmap-mux: fix parent clock lookup")' broke UFS support on SM8350. The cause for this is that the symbol clocks have a specified rate in the "freq-table-hz" table in the UFS node, which causes the UFS code to request a rate change, for which the "bi_tcxo" happens to provide the closest rate. Prior to the change in regmap-mux it was determined (incorrectly) that no change was needed and everything worked. The rates of 75 and 300MHz matches the documentation for the symbol clocks, but we don't represent the parent clocks today. So let's mimic the configuration found in other platforms, by omitting the rate for the symbol clocks as well to avoid the rate change. While at it also fill in the dummy symbol clocks that was dropped from the GCC driver as it was upstreamed. Fixes: 59c7cf814783 ("arm64: dts: qcom: sm8350: Add UFS nodes") Signed-off-by: Bjorn Andersson Reviewed-by: Vinod Koul Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20211222162058.3418902-1-bjorn.andersson@linaro.org --- arch/arm64/boot/dts/qcom/sm8350.dtsi | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sm8350.dtsi b/arch/arm64/boot/dts/qcom/sm8350.dtsi index 53b39e718fb6..4b19744bcfb3 100644 --- a/arch/arm64/boot/dts/qcom/sm8350.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8350.dtsi @@ -35,6 +35,24 @@ clock-frequency = <32000>; #clock-cells = <0>; }; + + ufs_phy_rx_symbol_0_clk: ufs-phy-rx-symbol-0 { + compatible = "fixed-clock"; + clock-frequency = <1000>; + #clock-cells = <0>; + }; + + ufs_phy_rx_symbol_1_clk: ufs-phy-rx-symbol-1 { + compatible = "fixed-clock"; + clock-frequency = <1000>; + #clock-cells = <0>; + }; + + ufs_phy_tx_symbol_0_clk: ufs-phy-tx-symbol-0 { + compatible = "fixed-clock"; + clock-frequency = <1000>; + #clock-cells = <0>; + }; }; cpus { @@ -603,9 +621,9 @@ <0>, <0>, <0>, - <0>, - <0>, - <0>, + <&ufs_phy_rx_symbol_0_clk>, + <&ufs_phy_rx_symbol_1_clk>, + <&ufs_phy_tx_symbol_0_clk>, <0>, <0>; }; @@ -1923,8 +1941,8 @@ <75000000 300000000>, <0 0>, <0 0>, - <75000000 300000000>, - <75000000 300000000>; + <0 0>, + <0 0>; status = "disabled"; }; -- cgit From 197769fede5824d218f1f13f7620243015801d81 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Sat, 22 Jan 2022 11:29:31 -0500 Subject: arm64: dts: qcom: sm8450: enable GCC_USB3_0_CLKREF_EN for usb USB doesn't work at all without this clock enabled. This fixes USB when not using clk_ignore_unused. Fixes: 19fd04fb9247 ("arm64: dts: qcom: sm8450: Add usb nodes") Signed-off-by: Jonathan Marek Reviewed-by: Vinod Koul Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220122162932.7686-1-jonathan@marek.ca --- arch/arm64/boot/dts/qcom/sm8450.dtsi | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi index 10c25ad2d0c7..f68475fb7a32 100644 --- a/arch/arm64/boot/dts/qcom/sm8450.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi @@ -1072,9 +1072,10 @@ <&gcc GCC_USB30_PRIM_MASTER_CLK>, <&gcc GCC_AGGRE_USB3_PRIM_AXI_CLK>, <&gcc GCC_USB30_PRIM_MOCK_UTMI_CLK>, - <&gcc GCC_USB30_PRIM_SLEEP_CLK>; + <&gcc GCC_USB30_PRIM_SLEEP_CLK>, + <&gcc GCC_USB3_0_CLKREF_EN>; clock-names = "cfg_noc", "core", "iface", "mock_utmi", - "sleep"; + "sleep", "xo"; assigned-clocks = <&gcc GCC_USB30_PRIM_MOCK_UTMI_CLK>, <&gcc GCC_USB30_PRIM_MASTER_CLK>; -- cgit From 7baa00bef336254e2cea5d4b064afe6430a05309 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Sat, 22 Jan 2022 11:29:32 -0500 Subject: arm64: dts: qcom: sm8450: fix apps_smmu interrupts Update interrupts in apps_smmu to match downstream. This is fixes apps_smmu failing to probe when running at EL2 (expects 96 context interrupts) Fixes: 892d5395396d ("arm64: dts: qcom: sm8450: add smmu nodes") Signed-off-by: Jonathan Marek Reviewed-by: Vinod Koul Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220122162932.7686-2-jonathan@marek.ca --- arch/arm64/boot/dts/qcom/sm8450.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi index f68475fb7a32..02b97e838c47 100644 --- a/arch/arm64/boot/dts/qcom/sm8450.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi @@ -726,7 +726,7 @@ compatible = "qcom,sm8450-smmu-500", "arm,mmu-500"; reg = <0 0x15000000 0 0x100000>; #iommu-cells = <2>; - #global-interrupts = <2>; + #global-interrupts = <1>; interrupts = , , , @@ -813,6 +813,7 @@ , , , + , , , , -- cgit From 9b818634f8e7e0bca3386a50b1fada7a49036408 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 2 Feb 2022 14:19:25 -0700 Subject: MAINTAINERS: update mailing list address for NTB subsystem NTB mailing list is moving from linux-ntb@googlegroups.com to ntb@lists.linux.dev in order to get better archive and lore support. Update all entries in MAINTAINERS. Signed-off-by: Dave Jiang Signed-off-by: Jon Mason --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..0a7edab46ed1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13674,7 +13674,7 @@ F: scripts/nsdeps NTB AMD DRIVER M: Sanjay R Mehta M: Shyam Sundar S K -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported F: drivers/ntb/hw/amd/ @@ -13682,7 +13682,7 @@ NTB DRIVER CORE M: Jon Mason M: Dave Jiang M: Allen Hubbe -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported W: https://github.com/jonmason/ntb/wiki T: git git://github.com/jonmason/ntb.git @@ -13694,13 +13694,13 @@ F: tools/testing/selftests/ntb/ NTB IDT DRIVER M: Serge Semin -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported F: drivers/ntb/hw/idt/ NTB INTEL DRIVER M: Dave Jiang -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported W: https://github.com/davejiang/linux/wiki T: git https://github.com/davejiang/linux.git -- cgit From 6d0d95a1c2b07270870e7be16575c513c29af3f1 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Tue, 1 Feb 2022 07:51:57 +0100 Subject: xfrm: fix the if_id check in changelink if_id will be always 0, because it was not yet initialized. Fixes: 8dce43919566 ("xfrm: interface with if_id 0 should return error") Reported-by: Pavel Machek Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 57448fc519fc..4e3c62d1ad9e 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -673,12 +673,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = xi->net; struct xfrm_if_parms p = {}; + xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } - xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); -- cgit From 2e8a8b5955a000cc655f7e368670518cbb77fe58 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 26 Jan 2022 15:55:40 +0100 Subject: arm64: dts: rockchip: reorder rk3399 hdmi clocks The binding specifies the clock order to "cec", "grf", "vpll". Reorder the clocks accordingly. Signed-off-by: Sascha Hauer Link: https://lore.kernel.org/r/20220126145549.617165-19-s.hauer@pengutronix.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index d3cdf6f42a30..080457a68e3c 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1881,10 +1881,10 @@ interrupts = ; clocks = <&cru PCLK_HDMI_CTRL>, <&cru SCLK_HDMI_SFR>, - <&cru PLL_VPLL>, + <&cru SCLK_HDMI_CEC>, <&cru PCLK_VIO_GRF>, - <&cru SCLK_HDMI_CEC>; - clock-names = "iahb", "isfr", "vpll", "grf", "cec"; + <&cru PLL_VPLL>; + clock-names = "iahb", "isfr", "cec", "grf", "vpll"; power-domains = <&power RK3399_PD_HDCP>; reg-io-width = <4>; rockchip,grf = <&grf>; -- cgit From 7c76ecd9c99b6e9a771d813ab1aa7fa428b3ade1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Feb 2022 16:14:32 +0200 Subject: xfrm: enforce validity of offload input flags struct xfrm_user_offload has flags variable that received user input, but kernel didn't check if valid bits were provided. It caused a situation where not sanitized input was forwarded directly to the drivers. For example, XFRM_OFFLOAD_IPV6 define that was exposed, was used by strongswan, but not implemented in the kernel at all. As a solution, check and sanitize input flags to forward XFRM_OFFLOAD_INBOUND to the drivers. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4e29d7851890..65e13a099b1a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -511,6 +511,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. + * Unfortunately, strongswan has the code that uses sets this flag, + * which makes impossible to reuse this bit. + * + * So leave it here to make sure that it won't be reused by mistake. + */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3fa066419d37..39bce5d764de 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -223,6 +223,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -262,7 +265,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { -- cgit From 6620e311ae76c502b685247b8f7232e81a321a5b Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Sun, 30 Jan 2022 15:39:35 +0100 Subject: MAINTAINERS: replace a Microchip AT91 maintainer As Ludovic is more focusing on other aspects of the Microchip Linux-based development, replace him with Claudiu. Entry is added to the CREDITS file. Thanks Ludovic for these great contributions in the kernel space! Signed-off-by: Nicolas Ferre Acked-by: Claudiu Beznea Acked-by: Ludovic Desroches Link: https://lore.kernel.org/r/23819d8baa635815d0893955197561fe4f044d5e.1643553501.git.nicolas.ferre@microchip.com --- CREDITS | 6 ++++++ MAINTAINERS | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index b97256d5bc24..7e85a53b6a88 100644 --- a/CREDITS +++ b/CREDITS @@ -895,6 +895,12 @@ S: 3000 FORE Drive S: Warrendale, Pennsylvania 15086 S: USA +N: Ludovic Desroches +E: ludovic.desroches@microchip.com +D: Maintainer for ARM/Microchip (AT91) SoC support +D: Author of ADC, pinctrl, XDMA and SDHCI drivers for this platform +S: France + N: Martin Devera E: devik@cdi.cz W: http://luxik.cdi.cz/~devik/qos/ diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..674d13708324 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2250,7 +2250,7 @@ F: drivers/phy/mediatek/ ARM/Microchip (AT91) SoC support M: Nicolas Ferre M: Alexandre Belloni -M: Ludovic Desroches +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported W: http://www.linux4sam.org -- cgit From 26077968f8389a68fdb38af3f2c2289ddc95e8ca Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Sun, 30 Jan 2022 15:39:36 +0100 Subject: dt-bindings: ARM: at91: update maintainers entry Align the binding documentation with the newly updated MAINTAINERS entry. Signed-off-by: Nicolas Ferre Acked-by: Claudiu Beznea Acked-by: Ludovic Desroches Link: https://lore.kernel.org/r/5bf9873eeee3cd49c52a8952a7cd4cb60b61d50a.1643553501.git.nicolas.ferre@microchip.com --- Documentation/devicetree/bindings/arm/atmel-at91.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.yaml b/Documentation/devicetree/bindings/arm/atmel-at91.yaml index c612e1f48dba..ff91df04f9f4 100644 --- a/Documentation/devicetree/bindings/arm/atmel-at91.yaml +++ b/Documentation/devicetree/bindings/arm/atmel-at91.yaml @@ -8,7 +8,8 @@ title: Atmel AT91 device tree bindings. maintainers: - Alexandre Belloni - - Ludovic Desroches + - Claudiu Beznea + - Nicolas Ferre description: | Boards with a SoC of the Atmel AT91 or SMART family shall have the following -- cgit From 728390fce4fc4d033a898fb6f5088697d03254b8 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Thu, 6 Jan 2022 17:37:03 -0600 Subject: dt-bindings: usb: dwc2: add compatible "intel,socfpga-agilex-hsotg" Add the compatible "intel,socfpga-agilex-hsotg" to the DWC2 implementation, because the Agilex DWC2 implementation does not support clock gating. Acked-by: Rob Herring Signed-off-by: Dinh Nguyen --- Documentation/devicetree/bindings/usb/dwc2.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/usb/dwc2.yaml b/Documentation/devicetree/bindings/usb/dwc2.yaml index f00867ebc147..481aaa09f3f2 100644 --- a/Documentation/devicetree/bindings/usb/dwc2.yaml +++ b/Documentation/devicetree/bindings/usb/dwc2.yaml @@ -53,6 +53,7 @@ properties: - const: st,stm32mp15-hsotg - const: snps,dwc2 - const: samsung,s3c6400-hsotg + - const: intel,socfpga-agilex-hsotg reg: maxItems: 1 -- cgit From 268a491aebc25e6dc7c618903b09ac3a2e8af530 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Thu, 6 Jan 2022 17:53:31 -0600 Subject: arm64: dts: agilex: use the compatible "intel,socfpga-agilex-hsotg" The DWC2 USB controller on the Agilex platform does not support clock gating, so use the chip specific "intel,socfpga-agilex-hsotg" compatible. Signed-off-by: Dinh Nguyen --- arch/arm64/boot/dts/intel/socfpga_agilex.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi index 0dd2d2ee765a..f4270cf18996 100644 --- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi +++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi @@ -502,7 +502,7 @@ }; usb0: usb@ffb00000 { - compatible = "snps,dwc2"; + compatible = "intel,socfpga-agilex-hsotg", "snps,dwc2"; reg = <0xffb00000 0x40000>; interrupts = ; phys = <&usbphy0>; @@ -515,7 +515,7 @@ }; usb1: usb@ffb40000 { - compatible = "snps,dwc2"; + compatible = "intel,socfpga-agilex-hsotg", "snps,dwc2"; reg = <0xffb40000 0x40000>; interrupts = ; phys = <&usbphy0>; -- cgit From 1ba603f56568c3b4c2542dfba07afa25f21dcff3 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 11 Feb 2022 10:27:04 +0000 Subject: firmware: arm_scmi: Remove space in MODULE_ALIAS name modprobe can't handle spaces in aliases. Get rid of it to fix the issue. Link: https://lore.kernel.org/r/20220211102704.128354-1-sudeep.holla@arm.com Fixes: aa4f886f3893 ("firmware: arm_scmi: add basic driver infrastructure for SCMI") Reviewed-by: Cristian Marussi Signed-off-by: Alyssa Ross Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c index b406b3f78f46..d76bab3aaac4 100644 --- a/drivers/firmware/arm_scmi/driver.c +++ b/drivers/firmware/arm_scmi/driver.c @@ -2112,7 +2112,7 @@ static void __exit scmi_driver_exit(void) } module_exit(scmi_driver_exit); -MODULE_ALIAS("platform: arm-scmi"); +MODULE_ALIAS("platform:arm-scmi"); MODULE_AUTHOR("Sudeep Holla "); MODULE_DESCRIPTION("ARM SCMI protocol driver"); MODULE_LICENSE("GPL v2"); -- cgit From be4e65bdffab5f588044325117df77dad7e9c45a Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 10 Feb 2022 15:23:53 +0100 Subject: ARM: dts: rockchip: reorder rk322x hmdi clocks The binding specifies the clock order to "iahb", "isfr", "cec". Reorder the clocks accordingly. Signed-off-by: Sascha Hauer Link: https://lore.kernel.org/r/20220210142353.3420859-1-s.hauer@pengutronix.de Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rk322x.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/rk322x.dtsi b/arch/arm/boot/dts/rk322x.dtsi index 8eed9e3a92e9..5868eb512f69 100644 --- a/arch/arm/boot/dts/rk322x.dtsi +++ b/arch/arm/boot/dts/rk322x.dtsi @@ -718,8 +718,8 @@ interrupts = ; assigned-clocks = <&cru SCLK_HDMI_PHY>; assigned-clock-parents = <&hdmi_phy>; - clocks = <&cru SCLK_HDMI_HDCP>, <&cru PCLK_HDMI_CTRL>, <&cru SCLK_HDMI_CEC>; - clock-names = "isfr", "iahb", "cec"; + clocks = <&cru PCLK_HDMI_CTRL>, <&cru SCLK_HDMI_HDCP>, <&cru SCLK_HDMI_CEC>; + clock-names = "iahb", "isfr", "cec"; pinctrl-names = "default"; pinctrl-0 = <&hdmii2c_xfer &hdmi_hpd &hdmi_cec>; resets = <&cru SRST_HDMI_P>; -- cgit From 3916c3619599a3970d3e6f98fb430b7c46266ada Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Wed, 9 Feb 2022 12:03:55 +0000 Subject: ARM: dts: rockchip: fix a typo on rk3288 crypto-controller crypto-controller had a typo, fix it. In the same time, rename it to just crypto Signed-off-by: Corentin Labbe Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220209120355.1985707-1-clabbe@baylibre.com Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rk3288.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index aaaa61875701..45a9d9b908d2 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -971,7 +971,7 @@ status = "disabled"; }; - crypto: cypto-controller@ff8a0000 { + crypto: crypto@ff8a0000 { compatible = "rockchip,rk3288-crypto"; reg = <0x0 0xff8a0000 0x0 0x4000>; interrupts = ; -- cgit From ef3075d6638d3d5353a97fcc7bb0338fc85675f5 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Tue, 25 Jan 2022 11:11:25 -0600 Subject: arm64: dts: imx8mm: Fix VPU Hanging The vpumix power domain has a reset assigned to it, however when used, it causes a system hang. Testing has shown that it does not appear to be needed anywhere. Fixes: d39d4bb15310 ("arm64: dts: imx8mm: add GPC node") Signed-off-by: Adam Ford Reviewed-by: Lucas Stach Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm.dtsi | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi index f77f90ed416f..0c7a72c51a31 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi @@ -707,7 +707,6 @@ clocks = <&clk IMX8MM_CLK_VPU_DEC_ROOT>; assigned-clocks = <&clk IMX8MM_CLK_VPU_BUS>; assigned-clock-parents = <&clk IMX8MM_SYS_PLL1_800M>; - resets = <&src IMX8MQ_RESET_VPU_RESET>; }; pgc_vpu_g1: power-domain@7 { -- cgit From 45d941f67b000b6d79159522a0bbfc37cfd584d6 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 11 Feb 2022 11:02:04 +0000 Subject: arm64: dts: imx8ulp: Set #thermal-sensor-cells to 1 as required The SCMI binding clearly states the value of #thermal-sensor-cells must be 1. However arch/arm64/boot/dts/freescale/imx8ulp.dtsi sets it 0 which results in the following warning with dtbs_check: | arch/arm64/boot/dts/freescale/imx8ulp-evk.dt.yaml: scmi: | protocol@15:#thermal-sensor-cells:0:0: 1 was expected | From schema: Documentation/devicetree/bindings/firmware/arm,scmi.yaml Fix it by setting it to 1 as required. Cc:Shawn Guo Cc: Sascha Hauer Signed-off-by: Sudeep Holla Reviewed-by: Fabio Estevam Acked-by: Peng Fan Fixes: a38771d7a49b ("arm64: dts: imx8ulp: add scmi firmware node") Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8ulp.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8ulp.dtsi b/arch/arm64/boot/dts/freescale/imx8ulp.dtsi index a987ff7156bd..09f7364dd1d0 100644 --- a/arch/arm64/boot/dts/freescale/imx8ulp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8ulp.dtsi @@ -132,7 +132,7 @@ scmi_sensor: protocol@15 { reg = <0x15>; - #thermal-sensor-cells = <0>; + #thermal-sensor-cells = <1>; }; }; }; -- cgit From 5ce97f4ec5e0f8726a5dda1710727b1ee9badcac Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Mon, 4 Oct 2021 13:07:24 +0300 Subject: iommu/amd: Recover from event log overflow The AMD IOMMU logs I/O page faults and such to a ring buffer in system memory, and this ring buffer can overflow. The AMD IOMMU spec has the following to say about the interrupt status bit that signals this overflow condition: EventOverflow: Event log overflow. RW1C. Reset 0b. 1 = IOMMU event log overflow has occurred. This bit is set when a new event is to be written to the event log and there is no usable entry in the event log, causing the new event information to be discarded. An interrupt is generated when EventOverflow = 1b and MMIO Offset 0018h[EventIntEn] = 1b. No new event log entries are written while this bit is set. Software Note: To resume logging, clear EventOverflow (W1C), and write a 1 to MMIO Offset 0018h[EventLogEn]. The AMD IOMMU driver doesn't currently implement this recovery sequence, meaning that if a ring buffer overflow occurs, logging of EVT/PPR/GA events will cease entirely. This patch implements the spec-mandated reset sequence, with the minor tweak that the hardware seems to want to have a 0 written to MMIO Offset 0018h[EventLogEn] first, before writing an 1 into this field, or the IOMMU won't actually resume logging events. Signed-off-by: Lennert Buytenhek Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/YVrSXEdW2rzEfOvk@wantstofly.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 1 + drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/init.c | 10 ++++++++++ drivers/iommu/amd/iommu.c | 10 ++++++++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 416815a525d6..bb95edf74415 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -14,6 +14,7 @@ extern irqreturn_t amd_iommu_int_thread(int irq, void *data); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_restart_event_logging(struct amd_iommu *iommu); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); extern int amd_iommu_init_devices(void); extern void amd_iommu_uninit_devices(void); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index ffc89c4fb120..47108ed44fbb 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -110,6 +110,7 @@ #define PASID_MASK 0x0000ffff /* MMIO status bits */ +#define MMIO_STATUS_EVT_OVERFLOW_INT_MASK (1 << 0) #define MMIO_STATUS_EVT_INT_MASK (1 << 1) #define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2) #define MMIO_STATUS_PPR_INT_MASK (1 << 6) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index b10fb52ea442..7bfe37e52e21 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -657,6 +657,16 @@ static int __init alloc_command_buffer(struct amd_iommu *iommu) return iommu->cmd_buf ? 0 : -ENOMEM; } +/* + * This function restarts event logging in case the IOMMU experienced + * an event log buffer overflow. + */ +void amd_iommu_restart_event_logging(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); +} + /* * This function resets the command buffer if the IOMMU stopped fetching * commands from it. diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 461f1844ed1f..a18b549951bb 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -764,7 +764,8 @@ amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { } #endif /* !CONFIG_IRQ_REMAP */ #define AMD_IOMMU_INT_MASK \ - (MMIO_STATUS_EVT_INT_MASK | \ + (MMIO_STATUS_EVT_OVERFLOW_INT_MASK | \ + MMIO_STATUS_EVT_INT_MASK | \ MMIO_STATUS_PPR_INT_MASK | \ MMIO_STATUS_GALOG_INT_MASK) @@ -774,7 +775,7 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data) u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); while (status & AMD_IOMMU_INT_MASK) { - /* Enable EVT and PPR and GA interrupts again */ + /* Enable interrupt sources again */ writel(AMD_IOMMU_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); @@ -795,6 +796,11 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data) } #endif + if (status & MMIO_STATUS_EVT_OVERFLOW_INT_MASK) { + pr_info_ratelimited("IOMMU event log overflow\n"); + amd_iommu_restart_event_logging(iommu); + } + /* * Hardware bug: ERBT1312 * When re-enabling interrupt (by writing 1 -- cgit From 40eb0dcf4114cbfff4d207890fa5a19e82da9fdc Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 10 Feb 2022 17:10:53 +0800 Subject: tee: optee: fix error return code in probe function If teedev_open() fails, probe function need return error code. Fixes: aceeafefff73 ("optee: use driver internal tee_context for some rpc") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Jens Wiklander --- drivers/tee/optee/ffa_abi.c | 4 +++- drivers/tee/optee/smc_abi.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c index 545f61af1248..0c90355896a0 100644 --- a/drivers/tee/optee/ffa_abi.c +++ b/drivers/tee/optee/ffa_abi.c @@ -860,8 +860,10 @@ static int optee_ffa_probe(struct ffa_device *ffa_dev) optee_supp_init(&optee->supp); ffa_dev_set_drvdata(ffa_dev, optee); ctx = teedev_open(optee->teedev); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + rc = PTR_ERR(ctx); goto err_rhashtable_free; + } optee->ctx = ctx; rc = optee_notif_init(optee, OPTEE_DEFAULT_MAX_NOTIF_VALUE); if (rc) diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c index bacd1a1d79ee..4157f4b41bdd 100644 --- a/drivers/tee/optee/smc_abi.c +++ b/drivers/tee/optee/smc_abi.c @@ -1427,8 +1427,10 @@ static int optee_probe(struct platform_device *pdev) platform_set_drvdata(pdev, optee); ctx = teedev_open(optee->teedev); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + rc = PTR_ERR(ctx); goto err_supp_uninit; + } optee->ctx = ctx; rc = optee_notif_init(optee, max_notif_value); if (rc) -- cgit From 6b0b2d9a6a308bcd9300c2d83000a82812c56cea Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 10 Feb 2022 09:47:45 -0600 Subject: iommu/amd: Fix I/O page table memory leak The current logic updates the I/O page table mode for the domain before calling the logic to free memory used for the page table. This results in IOMMU page table memory leak, and can be observed when launching VM w/ pass-through devices. Fix by freeing the memory used for page table before updating the mode. Cc: Joerg Roedel Reported-by: Daniel Jordan Tested-by: Daniel Jordan Signed-off-by: Suravee Suthikulpanit Fixes: e42ba0633064 ("iommu/amd: Restructure code for freeing page table") Link: https://lore.kernel.org/all/20220118194720.urjgi73b7c3tq2o6@oracle.com/ Link: https://lore.kernel.org/r/20220210154745.11524-1-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index b1bf4125b0f7..6608d1717574 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -492,18 +492,18 @@ static void v1_free_pgtable(struct io_pgtable *iop) dom = container_of(pgtable, struct protection_domain, iop); - /* Update data structure */ - amd_iommu_domain_clr_pt_root(dom); - - /* Make changes visible to IOMMUs */ - amd_iommu_domain_update(dom); - /* Page-table is not visible to IOMMU anymore, so free it */ BUG_ON(pgtable->mode < PAGE_MODE_NONE || pgtable->mode > PAGE_MODE_6_LEVEL); free_sub_pt(pgtable->root, pgtable->mode, &freelist); + /* Update data structure */ + amd_iommu_domain_clr_pt_root(dom); + + /* Make changes visible to IOMMUs */ + amd_iommu_domain_update(dom); + put_pages_list(&freelist); } -- cgit From 382e3e0eb6a83f1cf73d4dfa3448ade1ed721f22 Mon Sep 17 00:00:00 2001 From: Steev Klimaszewski Date: Thu, 4 Nov 2021 22:52:32 -0500 Subject: arm64: dts: qcom: c630: disable crypto due to serror Disable the crypto block due to it causing an SError in qce_start() on the C630, which happens upon every boot when cryptomanager tests are enabled. Signed-off-by: Steev Klimaszewski [bjorn: Reworked commit message] Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20211105035235.2392-1-steev@kali.org --- arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 58845a14805f..e2b9ec134cb1 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -807,3 +807,8 @@ qcom,snoc-host-cap-8bit-quirk; }; + +&crypto { + /* FIXME: qce_start triggers an SError */ + status= "disable"; +}; -- cgit From 4330e2c5c04c27bebf89d34e0bc14e6943413067 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 17 Nov 2021 15:15:26 +0000 Subject: arm64: entry.S: Add ventry overflow sanity checks Subsequent patches add even more code to the ventry slots. Ensure kernels that overflow a ventry slot don't get built. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 772ec2ecf488..bd940b2254da 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -37,6 +37,7 @@ .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 +.Lventry_start\@: #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 alternative_if ARM64_UNMAP_KERNEL_AT_EL0 @@ -95,6 +96,7 @@ alternative_else_nop_endif mrs x0, tpidrro_el0 #endif b el\el\ht\()_\regsize\()_\label +.org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm .macro tramp_alias, dst, sym @@ -662,6 +664,7 @@ alternative_else_nop_endif add x30, x30, #(1b - tramp_vectors) isb ret +.org 1b + 128 // Did we overflow the ventry slot? .endm .macro tramp_exit, regsize = 64 -- cgit From 1b33d4860deaecf1d8eec3061b7e7ed7ab0bae8d Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 16 Nov 2021 15:00:51 +0000 Subject: arm64: spectre: Rename spectre_v4_patch_fw_mitigation_conduit The spectre-v4 sequence includes an SMC from the assembly entry code. spectre_v4_patch_fw_mitigation_conduit is the patching callback that generates an HVC or SMC depending on the SMCCC conduit type. As this isn't specific to spectre-v4, rename it smccc_patch_fw_mitigation_conduit so it can be re-used. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/proton-pack.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index bd940b2254da..d97eb024adec 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -118,7 +118,7 @@ alternative_cb_end tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 mov w1, #\state -alternative_cb spectre_v4_patch_fw_mitigation_conduit +alternative_cb smccc_patch_fw_mitigation_conduit nop // Patched to SMC/HVC #0 alternative_cb_end .L__asm_ssbd_skip\@: diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 902e4084c477..9394f21d7566 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -554,9 +554,9 @@ void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt, * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction * to call into firmware to adjust the mitigation state. */ -void __init spectre_v4_patch_fw_mitigation_conduit(struct alt_instr *alt, - __le32 *origptr, - __le32 *updptr, int nr_inst) +void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) { u32 insn; -- cgit From 5bdf3437603d4af87f9c7f424b0c8aeed2420745 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 16 Nov 2021 15:06:19 +0000 Subject: KVM: arm64: Allow indirect vectors to be used without SPECTRE_V3A CPUs vulnerable to Spectre-BHB either need to make an SMC-CC firmware call from the vectors, or run a sequence of branches. This gets added to the hyp vectors. If there is no support for arch-workaround-1 in firmware, the indirect vector will be used. kvm_init_vector_slots() only initialises the two indirect slots if the platform is vulnerable to Spectre-v3a. pKVM's hyp_map_vectors() only initialises __hyp_bp_vect_base if the platform is vulnerable to Spectre-v3a. As there are about to more users of the indirect vectors, ensure their entries in hyp_spectre_vector_selector[] are always initialised, and __hyp_bp_vect_base defaults to the regular VA mapping. The Spectre-v3a check is moved to a helper kvm_system_needs_idmapped_vectors(), and merged with the code that creates the hyp mappings. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/kvm_host.h | 5 +++++ arch/arm64/kvm/arm.c | 5 +---- arch/arm64/kvm/hyp/nvhe/mm.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5bc01e62c08a..031e3a2537fc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -714,6 +714,11 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr(); } +static inline bool kvm_system_needs_idmapped_vectors(void) +{ + return cpus_have_const_cap(ARM64_SPECTRE_V3A); +} + void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); static inline void kvm_arch_hardware_unsetup(void) {} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ecc5958e27fe..4dca6ffd03d4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1491,10 +1491,7 @@ static int kvm_init_vector_slots(void) base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); - if (!cpus_have_const_cap(ARM64_SPECTRE_V3A)) - return 0; - - if (!has_vhe()) { + if (kvm_system_needs_idmapped_vectors() && !has_vhe()) { err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), __BP_HARDEN_HYP_VECS_SZ, &base); if (err) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 526a7d6fa86f..cdbe8e246418 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -148,8 +148,10 @@ int hyp_map_vectors(void) phys_addr_t phys; void *bp_base; - if (!cpus_have_const_cap(ARM64_SPECTRE_V3A)) + if (!kvm_system_needs_idmapped_vectors()) { + __hyp_bp_vect_base = __bp_harden_hyp_vecs; return 0; + } phys = __hyp_pa(__bp_harden_hyp_vecs); bp_base = (void *)__pkvm_create_private_mapping(phys, -- cgit From d739da1694a0eaef0358a42b76904b611539b77b Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 15:36:12 +0000 Subject: arm64: entry: Make the trampoline cleanup optional Subsequent patches will add additional sets of vectors that use the same tricks as the kpti vectors to reach the full-fat vectors. The full-fat vectors contain some cleanup for kpti that is patched in by alternatives when kpti is in use. Once there are additional vectors, the cleanup will be needed in more cases. But on big/little systems, the cleanup would be harmful if no trampoline vector were in use. Instead of forcing CPUs that don't need a trampoline vector to use one, make the trampoline cleanup optional. Entry at the top of the vectors will skip the cleanup. The trampoline vectors can then skip the first instruction, triggering the cleanup to run. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d97eb024adec..bdbdb92b5f77 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -40,14 +40,18 @@ .Lventry_start\@: #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 -alternative_if ARM64_UNMAP_KERNEL_AT_EL0 + /* + * This must be the first instruction of the EL0 vector entries. It is + * skipped by the trampoline vectors, to trigger the cleanup. + */ + b .Lskip_tramp_vectors_cleanup\@ .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr .else mov x30, xzr .endif -alternative_else_nop_endif +.Lskip_tramp_vectors_cleanup\@: .endif #endif @@ -661,7 +665,7 @@ alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM prfm plil1strm, [x30, #(1b - tramp_vectors)] alternative_else_nop_endif msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors) + add x30, x30, #(1b - tramp_vectors + 4) isb ret .org 1b + 128 // Did we overflow the ventry slot? -- cgit From 03aff3a77a58b5b52a77e00537a42090ad57b80b Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Nov 2021 18:41:43 +0000 Subject: arm64: entry: Free up another register on kpti's tramp_exit path Kpti stashes x30 in far_el1 while it uses x30 for all its work. Making the vectors a per-cpu data structure will require a second register. Allow tramp_exit two registers before it unmaps the kernel, by leaving x30 on the stack, and stashing x29 in far_el1. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index bdbdb92b5f77..45e89135dc11 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -419,14 +419,16 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] - ldr lr, [sp, #S_LR] - add sp, sp, #PT_REGS_SIZE // restore sp .if \el == 0 -alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + eret +alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f - msr far_el1, x30 + msr far_el1, x29 tramp_alias x30, tramp_exit_native br x30 4: @@ -434,6 +436,9 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 br x30 #endif .else + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + /* Ensure any device/NC reads complete */ alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 @@ -674,10 +679,12 @@ alternative_else_nop_endif .macro tramp_exit, regsize = 64 adr x30, tramp_vectors msr vbar_el1, x30 - tramp_unmap_kernel x30 + ldr lr, [sp, #S_LR] + tramp_unmap_kernel x29 .if \regsize == 64 - mrs x30, far_el1 + mrs x29, far_el1 .endif + add sp, sp, #PT_REGS_SIZE // restore sp eret sb .endm -- cgit From c091fb6ae059cda563b2a4d93fdbc548ef34e1d6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Nov 2021 15:43:31 +0000 Subject: arm64: entry: Move the trampoline data page before the text page The trampoline code has a data page that holds the address of the vectors, which is unmapped when running in user-space. This ensures that with CONFIG_RANDOMIZE_BASE, the randomised address of the kernel can't be discovered until after the kernel has been mapped. If the trampoline text page is extended to include multiple sets of vectors, it will be larger than a single page, making it tricky to find the data page without knowing the size of the trampoline text pages, which will vary with PAGE_SIZE. Move the data page to appear before the text page. This allows the data page to be found without knowing the size of the trampoline text pages. 'tramp_vectors' is used to refer to the beginning of the .entry.tramp.text section, do that explicitly. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/fixmap.h | 2 +- arch/arm64/kernel/entry.S | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 4335800201c9..0aabc0253b18 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,8 +62,8 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - FIX_ENTRY_TRAMP_DATA, FIX_ENTRY_TRAMP_TEXT, + FIX_ENTRY_TRAMP_DATA, #define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 45e89135dc11..d8a76869e873 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -644,6 +644,11 @@ alternative_else_nop_endif */ .endm + .macro tramp_data_page dst + adr \dst, .entry.tramp.text + sub \dst, \dst, PAGE_SIZE + .endm + .macro tramp_ventry, regsize = 64 .align 7 1: @@ -660,7 +665,7 @@ alternative_else_nop_endif 2: tramp_map_kernel x30 #ifdef CONFIG_RANDOMIZE_BASE - adr x30, tramp_vectors + PAGE_SIZE + tramp_data_page x30 alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 ldr x30, [x30] #else @@ -851,7 +856,7 @@ SYM_CODE_START(__sdei_asm_entry_trampoline) 1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] #ifdef CONFIG_RANDOMIZE_BASE - adr x4, tramp_vectors + PAGE_SIZE + tramp_data_page x4 add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler ldr x4, [x4] #else -- cgit From 6c5bf79b69f911560fbf82214c0971af6e58e682 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 11:40:18 +0000 Subject: arm64: entry: Allow tramp_alias to access symbols after the 4K boundary Systems using kpti enter and exit the kernel through a trampoline mapping that is always mapped, even when the kernel is not. tramp_valias is a macro to find the address of a symbol in the trampoline mapping. Adding extra sets of vectors will expand the size of the entry.tramp.text section to beyond 4K. tramp_valias will be unable to generate addresses for symbols beyond 4K as it uses the 12 bit immediate of the add instruction. As there are now two registers available when tramp_alias is called, use the extra register to avoid the 4K limit of the 12 bit immediate. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d8a76869e873..192cf77bd374 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -103,9 +103,12 @@ .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm - .macro tramp_alias, dst, sym + .macro tramp_alias, dst, sym, tmp mov_q \dst, TRAMP_VALIAS - add \dst, \dst, #(\sym - .entry.tramp.text) + adr_l \tmp, \sym + add \dst, \dst, \tmp + adr_l \tmp, .entry.tramp.text + sub \dst, \dst, \tmp .endm /* @@ -429,10 +432,10 @@ alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f msr far_el1, x29 - tramp_alias x30, tramp_exit_native + tramp_alias x30, tramp_exit_native, x29 br x30 4: - tramp_alias x30, tramp_exit_compat + tramp_alias x30, tramp_exit_compat, x29 br x30 #endif .else @@ -1000,7 +1003,7 @@ alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline + tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline, tmp=x3 br x5 #endif SYM_CODE_END(__sdei_asm_handler) -- cgit From ed50da7764535f1e24432ded289974f2bf2b0c5a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 13:40:09 +0000 Subject: arm64: entry: Don't assume tramp_vectors is the start of the vectors The tramp_ventry macro uses tramp_vectors as the address of the vectors when calculating which ventry in the 'full fat' vectors to branch to. While there is one set of tramp_vectors, this will be true. Adding multiple sets of vectors will break this assumption. Move the generation of the vectors to a macro, and pass the start of the vectors as an argument to tramp_ventry. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 192cf77bd374..4e6b8d97d941 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -652,7 +652,7 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, regsize = 64 + .macro tramp_ventry, vector_start, regsize .align 7 1: .if \regsize == 64 @@ -675,10 +675,10 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 ldr x30, =vectors #endif alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM - prfm plil1strm, [x30, #(1b - tramp_vectors)] + prfm plil1strm, [x30, #(1b - \vector_start)] alternative_else_nop_endif msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors + 4) + add x30, x30, #(1b - \vector_start + 4) isb ret .org 1b + 128 // Did we overflow the ventry slot? @@ -697,19 +697,21 @@ alternative_else_nop_endif sb .endm - .align 11 -SYM_CODE_START_NOALIGN(tramp_vectors) + .macro generate_tramp_vector +.Lvector_start\@: .space 0x400 - tramp_ventry - tramp_ventry - tramp_ventry - tramp_ventry + .rept 4 + tramp_ventry .Lvector_start\@, 64 + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32 + .endr + .endm - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 + .align 11 +SYM_CODE_START_NOALIGN(tramp_vectors) + generate_tramp_vector SYM_CODE_END(tramp_vectors) SYM_CODE_START(tramp_exit_native) -- cgit From 13d7a08352a83ef2252aeb464a5e08dfc06b5dfd Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 14:02:30 +0000 Subject: arm64: entry: Move trampoline macros out of ifdef'd section The macros for building the kpti trampoline are all behind CONFIG_UNMAP_KERNEL_AT_EL0, and in a region that outputs to the .entry.tramp.text section. Move the macros out so they can be used to generate other kinds of trampoline. Only the symbols need to be guarded by CONFIG_UNMAP_KERNEL_AT_EL0 and appear in the .entry.tramp.text section. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 4e6b8d97d941..2735c8941c2d 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -608,12 +608,6 @@ SYM_CODE_END(ret_to_user) .popsection // .entry.text -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -/* - * Exception vectors trampoline. - */ - .pushsection ".entry.tramp.text", "ax" - // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 @@ -709,6 +703,11 @@ alternative_else_nop_endif .endr .endm +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" .align 11 SYM_CODE_START_NOALIGN(tramp_vectors) generate_tramp_vector -- cgit From c47e4d04ba0f1ea17353d85d45f611277507e07a Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 13:16:23 +0000 Subject: arm64: entry: Make the kpti trampoline's kpti sequence optional Spectre-BHB needs to add sequences to the vectors. Having one global set of vectors is a problem for big/little systems where the sequence is costly on cpus that are not vulnerable. Making the vectors per-cpu in the style of KVM's bh_harden_hyp_vecs requires the vectors to be generated by macros. Make the kpti re-mapping of the kernel optional, so the macros can be used without kpti. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2735c8941c2d..23e85c94ea36 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -646,9 +646,10 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, vector_start, regsize + .macro tramp_ventry, vector_start, regsize, kpti .align 7 1: + .if \kpti == 1 .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif @@ -671,9 +672,14 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM prfm plil1strm, [x30, #(1b - \vector_start)] alternative_else_nop_endif + msr vbar_el1, x30 - add x30, x30, #(1b - \vector_start + 4) isb + .else + ldr x30, =vectors + .endif // \kpti == 1 + + add x30, x30, #(1b - \vector_start + 4) ret .org 1b + 128 // Did we overflow the ventry slot? .endm @@ -691,15 +697,15 @@ alternative_else_nop_endif sb .endm - .macro generate_tramp_vector + .macro generate_tramp_vector, kpti .Lvector_start\@: .space 0x400 .rept 4 - tramp_ventry .Lvector_start\@, 64 + tramp_ventry .Lvector_start\@, 64, \kpti .endr .rept 4 - tramp_ventry .Lvector_start\@, 32 + tramp_ventry .Lvector_start\@, 32, \kpti .endr .endm @@ -710,7 +716,7 @@ alternative_else_nop_endif .pushsection ".entry.tramp.text", "ax" .align 11 SYM_CODE_START_NOALIGN(tramp_vectors) - generate_tramp_vector + generate_tramp_vector kpti=1 SYM_CODE_END(tramp_vectors) SYM_CODE_START(tramp_exit_native) -- cgit From a9c406e6462ff14956d690de7bbe5131a5677dc9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 15:04:32 +0000 Subject: arm64: entry: Allow the trampoline text to occupy multiple pages Adding a second set of vectors to .entry.tramp.text will make it larger than a single 4K page. Allow the trampoline text to occupy up to three pages by adding two more fixmap slots. Previous changes to tramp_valias allowed it to reach beyond a single page. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/fixmap.h | 6 ++++-- arch/arm64/include/asm/sections.h | 5 +++++ arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/vmlinux.lds.S | 2 +- arch/arm64/mm/mmu.c | 12 +++++++++--- 5 files changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 0aabc0253b18..daff882883f9 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,9 +62,11 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - FIX_ENTRY_TRAMP_TEXT, + FIX_ENTRY_TRAMP_TEXT3, + FIX_ENTRY_TRAMP_TEXT2, + FIX_ENTRY_TRAMP_TEXT1, FIX_ENTRY_TRAMP_DATA, -#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT1)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 152cb35bf9df..40971ac1303f 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -23,4 +23,9 @@ extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; extern char __relocate_new_kernel_start[], __relocate_new_kernel_end[]; +static inline size_t entry_tramp_text_size(void) +{ + return __entry_tramp_text_end - __entry_tramp_text_start; +} + #endif /* __ASM_SECTIONS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 23e85c94ea36..e0a0c1da5db8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -642,7 +642,7 @@ alternative_else_nop_endif .endm .macro tramp_data_page dst - adr \dst, .entry.tramp.text + adr_l \dst, .entry.tramp.text sub \dst, \dst, PAGE_SIZE .endm diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 50bab186c49b..edaf0faf766f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -341,7 +341,7 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, "Entry trampoline text too big") #endif #ifdef CONFIG_KVM diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index acfae9b41cc8..49abbf43bf35 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -617,6 +617,8 @@ early_param("rodata", parse_rodata); #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { + int i; + pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); @@ -625,11 +627,15 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); - __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, - prot, __pgd_pgtable_alloc, 0); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, + entry_tramp_text_size(), prot, + __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS); /* Map both the text and data into the kernel page table */ - __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) + __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, + pa_start + i * PAGE_SIZE, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern char __entry_tramp_data_start[]; -- cgit From aff65393fa1401e034656e349abd655cfe272de0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 15:03:15 +0000 Subject: arm64: entry: Add non-kpti __bp_harden_el1_vectors for mitigations kpti is an optional feature, for systems not using kpti a set of vectors for the spectre-bhb mitigations is needed. Add another set of vectors, __bp_harden_el1_vectors, that will be used if a mitigation is needed and kpti is not in use. The EL1 ventries are repeated verbatim as there is no additional work needed for entry from EL1. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e0a0c1da5db8..9c4ff75f983e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -649,10 +649,11 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti .align 7 1: - .if \kpti == 1 .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy * entry onto the return stack and using a RET instruction to @@ -739,6 +740,38 @@ SYM_DATA_END(__entry_tramp_data_start) #endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ +/* + * Exception vectors for spectre mitigations on entry from EL1 when + * kpti is not in use. + */ + .macro generate_el1_vector +.Lvector_start\@: + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1h + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + .rept 4 + tramp_ventry .Lvector_start\@, 64, kpti=0 + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, kpti=0 + .endr + .endm + + .pushsection ".entry.text", "ax" + .align 11 +SYM_CODE_START(__bp_harden_el1_vectors) + generate_el1_vector +SYM_CODE_END(__bp_harden_el1_vectors) + .popsection + + /* * Register switch for AArch64. The callee-saved registers need to be saved * and restored. On entry: -- cgit From ba2689234be92024e5635d30fe744f4853ad97db Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 13:59:46 +0000 Subject: arm64: entry: Add vectors that have the bhb mitigation sequences Some CPUs affected by Spectre-BHB need a sequence of branches, or a firmware call to be run before any indirect branch. This needs to go in the vectors. No CPU needs both. While this can be patched in, it would run on all CPUs as there is a single set of vectors. If only one part of a big/little combination is affected, the unaffected CPUs have to run the mitigation too. Create extra vectors that include the sequence. Subsequent patches will allow affected CPUs to select this set of vectors. Later patches will modify the loop count to match what the CPU requires. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/assembler.h | 24 +++++++++++++++++ arch/arm64/include/asm/vectors.h | 34 ++++++++++++++++++++++++ arch/arm64/kernel/entry.S | 53 +++++++++++++++++++++++++++++++------- arch/arm64/kernel/proton-pack.c | 16 ++++++++++++ include/linux/arm-smccc.h | 5 ++++ 5 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 arch/arm64/include/asm/vectors.h diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index e8bd0af0141c..046c38ee2841 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -850,4 +850,28 @@ alternative_endif #endif /* GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT */ + .macro __mitigate_spectre_bhb_loop tmp +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + mov \tmp, #32 +.Lspectre_bhb_loop\@: + b . + 4 + subs \tmp, \tmp, #1 + b.ne .Lspectre_bhb_loop\@ + sb +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + + /* Save/restores x0-x3 to the stack */ + .macro __mitigate_spectre_bhb_fw +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + stp x0, x1, [sp, #-16]! + stp x2, x3, [sp, #-16]! + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 +alternative_cb smccc_patch_fw_mitigation_conduit + nop // Patched to SMC/HVC #0 +alternative_cb_end + ldp x2, x3, [sp], #16 + ldp x0, x1, [sp], #16 +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h new file mode 100644 index 000000000000..bac53fad037d --- /dev/null +++ b/arch/arm64/include/asm/vectors.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 ARM Ltd. + */ +#ifndef __ASM_VECTORS_H +#define __ASM_VECTORS_H + +/* + * Note: the order of this enum corresponds to two arrays in entry.S: + * tramp_vecs and __bp_harden_el1_vectors. By default the canonical + * 'full fat' vectors are used directly. + */ +enum arm64_bp_harden_el1_vectors { +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + /* + * Perform the BHB loop mitigation, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_LOOP, + + /* + * Make the SMC call for firmware mitigation, before branching to the + * canonical vectors. + */ + EL1_VECTOR_BHB_FW, +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + + /* + * Remap the kernel before branching to the canonical vectors. + */ + EL1_VECTOR_KPTI, ++}; + +#endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9c4ff75f983e..2ceb0c3647b4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -646,13 +646,26 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, vector_start, regsize, kpti + +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 + + .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + .if \bhb == BHB_MITIGATION_LOOP + /* + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. + */ + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy @@ -680,6 +693,15 @@ alternative_else_nop_endif ldr x30, =vectors .endif // \kpti == 1 + .if \bhb == BHB_MITIGATION_FW + /* + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. + */ + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW + add x30, x30, #(1b - \vector_start + 4) ret .org 1b + 128 // Did we overflow the ventry slot? @@ -687,6 +709,9 @@ alternative_else_nop_endif .macro tramp_exit, regsize = 64 adr x30, tramp_vectors +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + add x30, x30, SZ_4K +#endif msr vbar_el1, x30 ldr lr, [sp, #S_LR] tramp_unmap_kernel x29 @@ -698,26 +723,32 @@ alternative_else_nop_endif sb .endm - .macro generate_tramp_vector, kpti + .macro generate_tramp_vector, kpti, bhb .Lvector_start\@: .space 0x400 .rept 4 - tramp_ventry .Lvector_start\@, 64, \kpti + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, \kpti + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb .endr .endm #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 /* * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.tramp.text", "ax" .align 11 SYM_CODE_START_NOALIGN(tramp_vectors) - generate_tramp_vector kpti=1 +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) SYM_CODE_START(tramp_exit_native) @@ -744,7 +775,7 @@ SYM_DATA_END(__entry_tramp_data_start) * Exception vectors for spectre mitigations on entry from EL1 when * kpti is not in use. */ - .macro generate_el1_vector + .macro generate_el1_vector, bhb .Lvector_start\@: kernel_ventry 1, t, 64, sync // Synchronous EL1t kernel_ventry 1, t, 64, irq // IRQ EL1t @@ -757,17 +788,21 @@ SYM_DATA_END(__entry_tramp_data_start) kernel_ventry 1, h, 64, error // Error EL1h .rept 4 - tramp_ventry .Lvector_start\@, 64, kpti=0 + tramp_ventry .Lvector_start\@, 64, 0, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, kpti=0 + tramp_ventry .Lvector_start\@, 32, 0, \bhb .endr .endm +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.text", "ax" .align 11 SYM_CODE_START(__bp_harden_el1_vectors) - generate_el1_vector +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ SYM_CODE_END(__bp_harden_el1_vectors) .popsection diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 9394f21d7566..6a5eeb8beea3 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -770,3 +770,19 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return -ENODEV; } } + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); +} diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 63ccb5252190..220c8c60e021 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -92,6 +92,11 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define ARM_SMCCC_ARCH_WORKAROUND_3 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, 0x3fff) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ -- cgit From b28a8eebe81c186fdb1a0078263b30576c8e1f42 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 25 Nov 2021 14:25:34 +0000 Subject: arm64: entry: Add macro for reading symbol addresses from the trampoline The trampoline code needs to use the address of symbols in the wider kernel, e.g. vectors. PC-relative addressing wouldn't work as the trampoline code doesn't run at the address the linker expected. tramp_ventry uses a literal pool, unless CONFIG_RANDOMIZE_BASE is set, in which case it uses the data page as a literal pool because the data page can be unmapped when running in user-space, which is required for CPUs vulnerable to meltdown. Pull this logic out as a macro, instead of adding a third copy of it. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2ceb0c3647b4..8da732fefd8f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -646,6 +646,15 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm + .macro tramp_data_read_var dst, var +#ifdef CONFIG_RANDOMIZE_BASE + tramp_data_page \dst + add \dst, \dst, #:lo12:__entry_tramp_data_\var + ldr \dst, [\dst] +#else + ldr \dst, =\var +#endif + .endm #define BHB_MITIGATION_NONE 0 #define BHB_MITIGATION_LOOP 1 @@ -676,13 +685,8 @@ alternative_else_nop_endif b . 2: tramp_map_kernel x30 -#ifdef CONFIG_RANDOMIZE_BASE - tramp_data_page x30 alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 - ldr x30, [x30] -#else - ldr x30, =vectors -#endif + tramp_data_read_var x30, vectors alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM prfm plil1strm, [x30, #(1b - \vector_start)] alternative_else_nop_endif @@ -765,7 +769,12 @@ SYM_CODE_END(tramp_exit_compat) .pushsection ".rodata", "a" .align PAGE_SHIFT SYM_DATA_START(__entry_tramp_data_start) +__entry_tramp_data_vectors: .quad vectors +#ifdef CONFIG_ARM_SDE_INTERFACE +__entry_tramp_data___sdei_asm_handler: + .quad __sdei_asm_handler +#endif /* CONFIG_ARM_SDE_INTERFACE */ SYM_DATA_END(__entry_tramp_data_start) .popsection // .rodata #endif /* CONFIG_RANDOMIZE_BASE */ @@ -932,14 +941,7 @@ SYM_CODE_START(__sdei_asm_entry_trampoline) * Remember whether to unmap the kernel on exit. */ 1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] - -#ifdef CONFIG_RANDOMIZE_BASE - tramp_data_page x4 - add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler - ldr x4, [x4] -#else - ldr x4, =__sdei_asm_handler -#endif + tramp_data_read_var x4, __sdei_asm_handler br x4 SYM_CODE_END(__sdei_asm_entry_trampoline) NOKPROBE(__sdei_asm_entry_trampoline) @@ -962,13 +964,6 @@ SYM_CODE_END(__sdei_asm_exit_trampoline) NOKPROBE(__sdei_asm_exit_trampoline) .ltorg .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE -.pushsection ".rodata", "a" -SYM_DATA_START(__sdei_asm_trampoline_next_handler) - .quad __sdei_asm_handler -SYM_DATA_END(__sdei_asm_trampoline_next_handler) -.popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* -- cgit From bd09128d16fac3c34b80bd6a29088ac632e8ce09 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Nov 2021 18:29:25 +0000 Subject: arm64: Add percpu vectors for EL1 The Spectre-BHB workaround adds a firmware call to the vectors. This is needed on some CPUs, but not others. To avoid the unaffected CPU in a big/little pair from making the firmware call, create per cpu vectors. The per-cpu vectors only apply when returning from EL0. Systems using KPTI can use the canonical 'full-fat' vectors directly at EL1, the trampoline exit code will switch to this_cpu_vector on exit to EL0. Systems not using KPTI should always use this_cpu_vector. this_cpu_vector will point at a vector in tramp_vecs or __bp_harden_el1_vectors, depending on whether KPTI is in use. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/vectors.h | 29 ++++++++++++++++++++++++++++- arch/arm64/kernel/cpufeature.c | 11 +++++++++++ arch/arm64/kernel/entry.S | 12 ++++++------ arch/arm64/kvm/hyp/vhe/switch.c | 10 ++++++++-- 4 files changed, 53 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index bac53fad037d..3f76dfd9e074 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -5,6 +5,15 @@ #ifndef __ASM_VECTORS_H #define __ASM_VECTORS_H +#include +#include + +#include + +extern char vectors[]; +extern char tramp_vectors[]; +extern char __bp_harden_el1_vectors[]; + /* * Note: the order of this enum corresponds to two arrays in entry.S: * tramp_vecs and __bp_harden_el1_vectors. By default the canonical @@ -29,6 +38,24 @@ enum arm64_bp_harden_el1_vectors { * Remap the kernel before branching to the canonical vectors. */ EL1_VECTOR_KPTI, -+}; +}; + +/* The vectors to use on return from EL0. e.g. to remap the kernel */ +DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector); + +#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_VALIAS 0 +#endif + +static inline const char * +arm64_get_bp_hardening_vector(enum arm64_bp_harden_el1_vectors slot) +{ + if (arm64_kernel_unmapped_at_el0()) + return (char *)TRAMP_VALIAS + SZ_2K * slot; + + WARN_ON_ONCE(slot == EL1_VECTOR_KPTI); + + return __bp_harden_el1_vectors + SZ_2K * slot; +} #endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e5f23dab1c8d..45fed4974c44 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -73,6 +73,8 @@ #include #include #include +#include + #include #include #include @@ -85,6 +87,7 @@ #include #include #include +#include #include /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ @@ -110,6 +113,8 @@ DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); bool arm64_use_ng_mappings = false; EXPORT_SYMBOL(arm64_use_ng_mappings); +DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; + /* * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs * support it? @@ -1590,6 +1595,12 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) int cpu = smp_processor_id(); + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); + + __this_cpu_write(this_cpu_vector, v); + } + /* * We don't need to rewrite the page-tables if either we've done * it already or we have KASLR enabled and therefore have not diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8da732fefd8f..a62fee121138 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -38,7 +38,6 @@ .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 .Lventry_start\@: -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 /* * This must be the first instruction of the EL0 vector entries. It is @@ -53,7 +52,6 @@ .endif .Lskip_tramp_vectors_cleanup\@: .endif -#endif sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK @@ -712,10 +710,10 @@ alternative_else_nop_endif .endm .macro tramp_exit, regsize = 64 - adr x30, tramp_vectors -#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY - add x30, x30, SZ_4K -#endif + tramp_data_read_var x30, this_cpu_vector + get_this_cpu_offset x29 + ldr x30, [x30, x29] + msr vbar_el1, x30 ldr lr, [sp, #S_LR] tramp_unmap_kernel x29 @@ -775,6 +773,8 @@ __entry_tramp_data_vectors: __entry_tramp_data___sdei_asm_handler: .quad __sdei_asm_handler #endif /* CONFIG_ARM_SDE_INTERFACE */ +__entry_tramp_data_this_cpu_vector: + .quad this_cpu_vector SYM_DATA_END(__entry_tramp_data_start) .popsection // .rodata #endif /* CONFIG_RANDOMIZE_BASE */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 11d053fdd604..54af47005e45 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,8 @@ #include #include #include +#include +#include /* VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); @@ -67,7 +70,7 @@ NOKPROBE_SYMBOL(__activate_traps); static void __deactivate_traps(struct kvm_vcpu *vcpu) { - extern char vectors[]; /* kernel exception vectors */ + const char *host_vectors = vectors; ___deactivate_traps(vcpu); @@ -81,7 +84,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); - write_sysreg(vectors, vbar_el1); + + if (!arm64_kernel_unmapped_at_el0()) + host_vectors = __this_cpu_read(this_cpu_vector); + write_sysreg(host_vectors, vbar_el1); } NOKPROBE_SYMBOL(__deactivate_traps); -- cgit From dee435be76f4117410bbd90573a881fd33488f37 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 8 Feb 2022 16:08:13 +0000 Subject: arm64: proton-pack: Report Spectre-BHB vulnerabilities as part of Spectre-v2 Speculation attacks against some high-performance processors can make use of branch history to influence future speculation as part of a spectre-v2 attack. This is not mitigated by CSV2, meaning CPUs that previously reported 'Not affected' are now moderately mitigated by CSV2. Update the value in /sys/devices/system/cpu/vulnerabilities/spectre_v2 to also show the state of the BHB mitigation. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/spectre.h | 2 ++ arch/arm64/kernel/proton-pack.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index f62ca39da6c5..c617fe90a843 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -93,5 +93,7 @@ void spectre_v4_enable_task_mitigation(struct task_struct *tsk); enum mitigation_state arm64_get_meltdown_state(void); +enum mitigation_state arm64_get_spectre_bhb_state(void); + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 6a5eeb8beea3..8a499b8373c0 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -96,14 +96,39 @@ static bool spectre_v2_mitigations_off(void) return ret; } +static const char *get_bhb_affected_string(enum mitigation_state bhb_state) +{ + switch (bhb_state) { + case SPECTRE_UNAFFECTED: + return ""; + default: + case SPECTRE_VULNERABLE: + return ", but not BHB"; + case SPECTRE_MITIGATED: + return ", BHB"; + } +} + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { + enum mitigation_state bhb_state = arm64_get_spectre_bhb_state(); + const char *bhb_str = get_bhb_affected_string(bhb_state); + const char *v2_str = "Branch predictor hardening"; + switch (spectre_v2_state) { case SPECTRE_UNAFFECTED: - return sprintf(buf, "Not affected\n"); + if (bhb_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "Not affected\n"); + + /* + * Platforms affected by Spectre-BHB can't report + * "Not affected" for Spectre-v2. + */ + v2_str = "CSV2"; + fallthrough; case SPECTRE_MITIGATED: - return sprintf(buf, "Mitigation: Branch predictor hardening\n"); + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); case SPECTRE_VULNERABLE: fallthrough; default: @@ -771,6 +796,13 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) } } +static enum mitigation_state spectre_bhb_state; + +enum mitigation_state arm64_get_spectre_bhb_state(void) +{ + return spectre_bhb_state; +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, -- cgit From 610d086d6df0b15c3732a7b4a5b0f1c3e1b84d4c Mon Sep 17 00:00:00 2001 From: Deren Wu Date: Sun, 13 Feb 2022 00:20:15 +0800 Subject: mac80211: fix EAPoL rekey fail in 802.3 rx path mac80211 set capability NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211 to upper layer by default. That means we should pass EAPoL packets through nl80211 path only, and should not send the EAPoL skb to netdevice diretly. At the meanwhile, wpa_supplicant would not register sock to listen EAPoL skb on the netdevice. However, there is no control_port_protocol handler in mac80211 for 802.3 RX packets, mac80211 driver would pass up the EAPoL rekey frame to netdevice and wpa_supplicant would be never interactive with this kind of packets, if SUPPORTS_RX_DECAP_OFFLOAD is enabled. This causes STA always rekey fail if EAPoL frame go through 802.3 path. To avoid this problem, align the same process as 802.11 type to handle this frame before put it into network stack. This also addresses a potential security issue in 802.3 RX mode that was previously fixed in commit a8c4d76a8dd4 ("mac80211: do not accept/forward invalid EAPOL frames"). Cc: stable@vger.kernel.org # 5.12+ Fixes: 80a915ec4427 ("mac80211: add rx decapsulation offload support") Signed-off-by: Deren Wu Link: https://lore.kernel.org/r/6889c9fced5859ebb088564035f84fd0fa792a49.1644680751.git.deren.wu@mediatek.com [fix typos, update comment and add note about security issue] Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 93680af62c47..7b699b2f4d89 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2607,7 +2607,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, * address, so that the authenticator (e.g. hostapd) will see * the frame, but bridge won't forward it anywhere else. Note * that due to earlier filtering, the only other address can - * be the PAE group address. + * be the PAE group address, unless the hardware allowed them + * through in 802.3 offloaded mode. */ if (unlikely(skb->protocol == sdata->control_port_protocol && !ether_addr_equal(ehdr->h_dest, sdata->vif.addr))) @@ -4514,12 +4515,7 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, /* deliver to local stack */ skb->protocol = eth_type_trans(skb, fast_rx->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->list) - list_add_tail(&skb->list, rx->list); - else - netif_receive_skb(skb); - + ieee80211_deliver_skb_to_local_stack(skb, rx); } static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, -- cgit From a6bce78262f5dd4b50510f0aa47f3995f7b185f3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 3 Feb 2022 20:15:29 +0100 Subject: mac80211: refuse aggregations sessions before authorized If an MFP station isn't authorized, the receiver will (or at least should) drop the action frame since it's a robust management frame, but if we're not authorized we haven't installed keys yet. Refuse attempts to start a session as they'd just time out. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220203201528.ff4d5679dce9.I34bb1f2bc341e161af2d6faf74f91b332ba11285@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 74a878f213d3..1deb3d874a4b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #include @@ -626,6 +626,14 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, return -EINVAL; } + if (test_sta_flag(sta, WLAN_STA_MFP) && + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { + ht_dbg(sdata, + "MFP STA not authorized - deny BA session request %pM tid %d\n", + sta->sta.addr, tid); + return -EINVAL; + } + /* * 802.11n-2009 11.5.1.1: If the initiating STA is an HT STA, is a * member of an IBSS, and has no other existing Block Ack agreement -- cgit From 859ae7018316daa4adbc496012dcbbb458d7e510 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Mon, 14 Feb 2022 18:32:14 +0100 Subject: mac80211: fix forwarded mesh frames AC & queue selection There are two problems with the current code that have been highlighted with the AQL feature that is now enbaled by default. First problem is in ieee80211_rx_h_mesh_fwding(), ieee80211_select_queue_80211() is used on received packets to choose the sending AC queue of the forwarding packet although this function should only be called on TX packet (it uses ieee80211_tx_info). This ends with forwarded mesh packets been sent on unrelated random AC queue. To fix that, AC queue can directly be infered from skb->priority which has been extracted from QOS info (see ieee80211_parse_qos()). Second problem is the value of queue_mapping set on forwarded mesh frames via skb_set_queue_mapping() is not the AC of the packet but a hardware queue index. This may or may not work depending on AC to HW queue mapping which is driver specific. Both of these issues lead to improper AC selection while forwarding mesh packets but more importantly due to improper airtime accounting (which is done on a per STA, per AC basis) caused traffic stall with the introduction of AQL. Fixes: cf44012810cc ("mac80211: fix unnecessary frame drops in mesh fwding") Fixes: d3c1597b8d1b ("mac80211: fix forwarded mesh frame queue mapping") Co-developed-by: Remi Pommarel Signed-off-by: Remi Pommarel Signed-off-by: Nicolas Escande Link: https://lore.kernel.org/r/20220214173214.368862-1-nico.escande@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7b699b2f4d89..48d9553dafe3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2923,13 +2923,13 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) ether_addr_equal(sdata->vif.addr, hdr->addr3)) return RX_CONTINUE; - ac = ieee80211_select_queue_80211(sdata, skb, hdr); + ac = ieee802_1d_to_ac[skb->priority]; q = sdata->vif.hw_queue[ac]; if (ieee80211_queue_stopped(&local->hw, q)) { IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion); return RX_DROP_MONITOR; } - skb_set_queue_mapping(skb, q); + skb_set_queue_mapping(skb, ac); if (!--mesh_hdr->ttl) { if (!is_multicast_ether_addr(hdr->addr1)) -- cgit From 25666e8ccd952627899b09b68f7c9b68cfeaf028 Mon Sep 17 00:00:00 2001 From: Lucas Zampieri Date: Wed, 26 Jan 2022 11:44:00 -0300 Subject: HID: logitech-dj: add new lightspeed receiver id As of logitech lightspeed receiver fw version 04.02.B0009, HIDPP_PARAM_DEVICE_INFO is being reported as 0x11. With patch "HID: logitech-dj: add support for the new lightspeed receiver iteration", the mouse starts to error out with: logitech-djreceiver: unusable device of type UNKNOWN (0x011) connected on slot 1 and becomes unusable. This has been noticed on a Logitech G Pro X Superlight fw MPM 25.01.B0018. Signed-off-by: Lucas Zampieri Acked-by: Nestor Lopez Casado Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-dj.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index 7106b921b53c..c358778e070b 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -1068,6 +1068,7 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev, workitem.reports_supported |= STD_KEYBOARD; break; case 0x0f: + case 0x11: device_type = "eQUAD Lightspeed 1.2"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; -- cgit From 0a5a587501b54e8c6d86960b047d4491fd40dcf2 Mon Sep 17 00:00:00 2001 From: Michael Hübner Date: Thu, 20 Jan 2022 08:40:48 +0100 Subject: HID: Add support for open wheel and no attachment to T300 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Different add ons to the wheel base report different models. Having no wheel mounted to the base and using the open wheel attachment is added here. Signed-off-by: Michael Hübner Signed-off-by: Jiri Kosina --- drivers/hid/hid-thrustmaster.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index 03b935ff02d5..a4e20f9e598b 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -64,7 +64,9 @@ struct tm_wheel_info { */ static const struct tm_wheel_info tm_wheels_infos[] = { {0x0306, 0x0006, "Thrustmaster T150RS"}, + {0x0200, 0x0005, "Thrustmaster T300RS (Missing Attachment)"}, {0x0206, 0x0005, "Thrustmaster T300RS"}, + {0x0209, 0x0005, "Thrustmaster T300RS (Open Wheel Attachment)"}, {0x0204, 0x0005, "Thrustmaster T300 Ferrari Alcantara Edition"}, {0x0002, 0x0002, "Thrustmaster T500RS"} //{0x0407, 0x0001, "Thrustmaster TMX"} -- cgit From 9bdd10d57a8807dba0003af0325191f3cec0f11c Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 15 Feb 2022 14:06:45 +0100 Subject: ASoC: ops: Shift tested values in snd_soc_put_volsw() by +min While the $val/$val2 values passed in from userspace are always >= 0 integers, the limits of the control can be signed integers and the $min can be non-zero and less than zero. To correctly validate $val/$val2 against platform_max, add the $min offset to val first. Fixes: 817f7c9335ec0 ("ASoC: ops: Reject out of bounds values in snd_soc_put_volsw()") Signed-off-by: Marek Vasut Cc: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220215130645.164025-1-marex@denx.de Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 03ea9591fb16..a0ca58ba1627 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -319,7 +319,7 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (ucontrol->value.integer.value[0] < 0) return -EINVAL; val = ucontrol->value.integer.value[0]; - if (mc->platform_max && val > mc->platform_max) + if (mc->platform_max && ((int)val + min) > mc->platform_max) return -EINVAL; if (val > max - min) return -EINVAL; @@ -332,7 +332,7 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (ucontrol->value.integer.value[1] < 0) return -EINVAL; val2 = ucontrol->value.integer.value[1]; - if (mc->platform_max && val2 > mc->platform_max) + if (mc->platform_max && ((int)val2 + min) > mc->platform_max) return -EINVAL; if (val2 > max - min) return -EINVAL; -- cgit From c5487b9cdea5c1ede38a7ec94db0fc59963c8e86 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 15 Feb 2022 09:05:14 -0300 Subject: ASoC: cs4265: Fix the duplicated control name Currently, the following error messages are seen during boot: asoc-simple-card sound: control 2:0:0:SPDIF Switch:0 is already present cs4265 1-004f: ASoC: failed to add widget SPDIF dapm kcontrol SPDIF Switch: -16 Quoting Mark Brown: "The driver is just plain buggy, it defines both a regular SPIDF Switch control and a SND_SOC_DAPM_SWITCH() called SPDIF both of which will create an identically named control, it can never have loaded without error. One or both of those has to be renamed or they need to be merged into one thing." Fix the duplicated control name by combining the two SPDIF controls here and move the register bits onto the DAPM widget and have DAPM control them. Fixes: f853d6b3ba34 ("ASoC: cs4265: Add a S/PDIF enable switch") Signed-off-by: Fabio Estevam Acked-by: Charles Keepax Link: https://lore.kernel.org/r/20220215120514.1760628-1-festevam@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs4265.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs4265.c b/sound/soc/codecs/cs4265.c index 4aaee1873a11..4415fb364d4d 100644 --- a/sound/soc/codecs/cs4265.c +++ b/sound/soc/codecs/cs4265.c @@ -150,7 +150,6 @@ static const struct snd_kcontrol_new cs4265_snd_controls[] = { SOC_SINGLE("E to F Buffer Disable Switch", CS4265_SPDIF_CTL1, 6, 1, 0), SOC_ENUM("C Data Access", cam_mode_enum), - SOC_SINGLE("SPDIF Switch", CS4265_SPDIF_CTL2, 5, 1, 1), SOC_SINGLE("Validity Bit Control Switch", CS4265_SPDIF_CTL2, 3, 1, 0), SOC_ENUM("SPDIF Mono/Stereo", spdif_mono_stereo_enum), @@ -186,7 +185,7 @@ static const struct snd_soc_dapm_widget cs4265_dapm_widgets[] = { SND_SOC_DAPM_SWITCH("Loopback", SND_SOC_NOPM, 0, 0, &loopback_ctl), - SND_SOC_DAPM_SWITCH("SPDIF", SND_SOC_NOPM, 0, 0, + SND_SOC_DAPM_SWITCH("SPDIF", CS4265_SPDIF_CTL2, 5, 1, &spdif_switch), SND_SOC_DAPM_SWITCH("DAC", CS4265_PWRCTL, 1, 1, &dac_switch), -- cgit From cc19db8b312a6c75645645f5cc1b45166b109006 Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Fri, 11 Feb 2022 08:13:44 +0800 Subject: MIPS: ralink: mt7621: do memory detection on KSEG1 It's reported that current memory detection code occasionally detects larger memory under some bootloaders. Current memory detection code tests whether address space wraps around on KSEG0, which is unreliable because it's cached. Rewrite memory size detection to perform the same test on KSEG1 instead. While at it, this patch also does the following two things: 1. use a fixed pattern instead of a random function pointer as the magic value. 2. add an additional memory write and a second comparison as part of the test to prevent possible smaller memory detection result due to leftover values in memory. Fixes: 139c949f7f0a MIPS: ("ralink: mt7621: add memory detection support") Reported-by: Rui Salvaterra Signed-off-by: Chuanhong Guo Tested-by: Sergio Paracuellos Tested-by: Rui Salvaterra Signed-off-by: Thomas Bogendoerfer --- arch/mips/ralink/mt7621.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index d6efffd4dd20..12c8808e0dea 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -22,7 +22,9 @@ #include "common.h" -static void *detect_magic __initdata = detect_memory_region; +#define MT7621_MEM_TEST_PATTERN 0xaa5555aa + +static u32 detect_magic __initdata; int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { @@ -58,24 +60,32 @@ phys_addr_t mips_cpc_default_phys_base(void) panic("Cannot detect cpc address"); } +static bool __init mt7621_addr_wraparound_test(phys_addr_t size) +{ + void *dm = (void *)KSEG1ADDR(&detect_magic); + + if (CPHYSADDR(dm + size) >= MT7621_LOWMEM_MAX_SIZE) + return true; + __raw_writel(MT7621_MEM_TEST_PATTERN, dm); + if (__raw_readl(dm) != __raw_readl(dm + size)) + return false; + __raw_writel(!MT7621_MEM_TEST_PATTERN, dm); + return __raw_readl(dm) == __raw_readl(dm + size); +} + static void __init mt7621_memory_detect(void) { - void *dm = &detect_magic; phys_addr_t size; - for (size = 32 * SZ_1M; size < 256 * SZ_1M; size <<= 1) { - if (!__builtin_memcmp(dm, dm + size, sizeof(detect_magic))) - break; + for (size = 32 * SZ_1M; size <= 256 * SZ_1M; size <<= 1) { + if (mt7621_addr_wraparound_test(size)) { + memblock_add(MT7621_LOWMEM_BASE, size); + return; + } } - if ((size == 256 * SZ_1M) && - (CPHYSADDR(dm + size) < MT7621_LOWMEM_MAX_SIZE) && - __builtin_memcmp(dm, dm + size, sizeof(detect_magic))) { - memblock_add(MT7621_LOWMEM_BASE, MT7621_LOWMEM_MAX_SIZE); - memblock_add(MT7621_HIGHMEM_BASE, MT7621_HIGHMEM_SIZE); - } else { - memblock_add(MT7621_LOWMEM_BASE, size); - } + memblock_add(MT7621_LOWMEM_BASE, MT7621_LOWMEM_MAX_SIZE); + memblock_add(MT7621_HIGHMEM_BASE, MT7621_HIGHMEM_SIZE); } void __init ralink_of_remap(void) -- cgit From f2703def339c793674010cc9f01bfe4980231808 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 12 Feb 2022 22:21:11 +0000 Subject: MIPS: smp: fill in sibling and core maps earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After enabling CONFIG_SCHED_CORE (landed during 5.14 cycle), 2-core 2-thread-per-core interAptiv (CPS-driven) started emitting the following: [ 0.025698] CPU1 revision is: 0001a120 (MIPS interAptiv (multi)) [ 0.048183] ------------[ cut here ]------------ [ 0.048187] WARNING: CPU: 1 PID: 0 at kernel/sched/core.c:6025 sched_core_cpu_starting+0x198/0x240 [ 0.048220] Modules linked in: [ 0.048233] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.17.0-rc3+ #35 b7b319f24073fd9a3c2aa7ad15fb7993eec0b26f [ 0.048247] Stack : 817f0000 00000004 327804c8 810eb050 00000000 00000004 00000000 c314fdd1 [ 0.048278] 830cbd64 819c0000 81800000 817f0000 83070bf4 00000001 830cbd08 00000000 [ 0.048307] 00000000 00000000 815fcbc4 00000000 00000000 00000000 00000000 00000000 [ 0.048334] 00000000 00000000 00000000 00000000 817f0000 00000000 00000000 817f6f34 [ 0.048361] 817f0000 818a3c00 817f0000 00000004 00000000 00000000 4dc33260 0018c933 [ 0.048389] ... [ 0.048396] Call Trace: [ 0.048399] [<8105a7bc>] show_stack+0x3c/0x140 [ 0.048424] [<8131c2a0>] dump_stack_lvl+0x60/0x80 [ 0.048440] [<8108b5c0>] __warn+0xc0/0xf4 [ 0.048454] [<8108b658>] warn_slowpath_fmt+0x64/0x10c [ 0.048467] [<810bd418>] sched_core_cpu_starting+0x198/0x240 [ 0.048483] [<810c6514>] sched_cpu_starting+0x14/0x80 [ 0.048497] [<8108c0f8>] cpuhp_invoke_callback_range+0x78/0x140 [ 0.048510] [<8108d914>] notify_cpu_starting+0x94/0x140 [ 0.048523] [<8106593c>] start_secondary+0xbc/0x280 [ 0.048539] [ 0.048543] ---[ end trace 0000000000000000 ]--- [ 0.048636] Synchronize counters for CPU 1: done. ...for each but CPU 0/boot. Basic debug printks right before the mentioned line say: [ 0.048170] CPU: 1, smt_mask: So smt_mask, which is sibling mask obviously, is empty when entering the function. This is critical, as sched_core_cpu_starting() calculates core-scheduling parameters only once per CPU start, and it's crucial to have all the parameters filled in at that moment (at least it uses cpu_smt_mask() which in fact is `&cpu_sibling_map[cpu]` on MIPS). A bit of debugging led me to that set_cpu_sibling_map() performing the actual map calculation, was being invocated after notify_cpu_start(), and exactly the latter function starts CPU HP callback round (sched_core_cpu_starting() is basically a CPU HP callback). While the flow is same on ARM64 (maps after the notifier, although before calling set_cpu_online()), x86 started calculating sibling maps earlier than starting the CPU HP callbacks in Linux 4.14 (see [0] for the reference). Neither me nor my brief tests couldn't find any potential caveats in calculating the maps right after performing delay calibration, but the WARN splat is now gone. The very same debug prints now yield exactly what I expected from them: [ 0.048433] CPU: 1, smt_mask: 0-1 [0] https://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git/commit/?id=76ce7cfe35ef Signed-off-by: Alexander Lobakin Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index d542fb7af3ba..1986d1309410 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -351,6 +351,9 @@ asmlinkage void start_secondary(void) cpu = smp_processor_id(); cpu_data[cpu].udelay_val = loops_per_jiffy; + set_cpu_sibling_map(cpu); + set_cpu_core_map(cpu); + cpumask_set_cpu(cpu, &cpu_coherent_mask); notify_cpu_starting(cpu); @@ -362,9 +365,6 @@ asmlinkage void start_secondary(void) /* The CPU is running and counters synchronised, now mark it online */ set_cpu_online(cpu, true); - set_cpu_sibling_map(cpu); - set_cpu_core_map(cpu); - calculate_cpu_foreign_map(); /* -- cgit From ac89895213d8950dba6ab342863a0959f73142a7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 17 Feb 2022 14:13:49 +0100 Subject: HID: elo: Revert USB reference counting Commit 817b8b9c539 ("HID: elo: fix memory leak in elo_probe") introduced memory leak on error path, but more importantly the whole USB reference counting is not needed at all in the first place, as the driver itself doesn't change the reference counting in any way, and the associated usb_device is guaranteed to be kept around by USB core as long as the driver binding exists. Reported-by: Alan Stern Reported-by: Dan Carpenter Fixes: fbf42729d0e ("HID: elo: update the reference count of the usb device structure") Fixes: 817b8b9c539 ("HID: elo: fix memory leak in elo_probe") Signed-off-by: Jiri Kosina --- drivers/hid/hid-elo.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/hid/hid-elo.c b/drivers/hid/hid-elo.c index 9b42b0cdeef0..2876cb6a7dca 100644 --- a/drivers/hid/hid-elo.c +++ b/drivers/hid/hid-elo.c @@ -228,7 +228,6 @@ static int elo_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct elo_priv *priv; int ret; - struct usb_device *udev; if (!hid_is_usb(hdev)) return -EINVAL; @@ -238,8 +237,7 @@ static int elo_probe(struct hid_device *hdev, const struct hid_device_id *id) return -ENOMEM; INIT_DELAYED_WORK(&priv->work, elo_work); - udev = interface_to_usbdev(to_usb_interface(hdev->dev.parent)); - priv->usbdev = usb_get_dev(udev); + priv->usbdev = interface_to_usbdev(to_usb_interface(hdev->dev.parent)); hid_set_drvdata(hdev, priv); @@ -262,7 +260,6 @@ static int elo_probe(struct hid_device *hdev, const struct hid_device_id *id) return 0; err_free: - usb_put_dev(udev); kfree(priv); return ret; } @@ -271,8 +268,6 @@ static void elo_remove(struct hid_device *hdev) { struct elo_priv *priv = hid_get_drvdata(hdev); - usb_put_dev(priv->usbdev); - hid_hw_stop(hdev); cancel_delayed_work_sync(&priv->work); kfree(priv); -- cgit From a867e9d0cc15039a6ef72e17e2603303dcd1783f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 17 Feb 2022 10:12:42 +0000 Subject: KVM: arm64: Don't miss pending interrupts for suspended vCPU In order to properly emulate the WFI instruction, KVM reads back ICH_VMCR_EL2 and enables doorbells for GICv4. These preparations are necessary in order to recognize pending interrupts in kvm_arch_vcpu_runnable() and return to the guest. Until recently, this work was done by kvm_arch_vcpu_{blocking,unblocking}(). Since commit 6109c5a6ab7f ("KVM: arm64: Move vGIC v4 handling for WFI out arch callback hook"), these callbacks were gutted and superseded by kvm_vcpu_wfi(). It is important to note that KVM implements PSCI CPU_SUSPEND calls as a WFI within the guest. However, the implementation calls directly into kvm_vcpu_halt(), which skips the needed work done in kvm_vcpu_wfi() to detect pending interrupts. Fix the issue by calling the WFI helper. Fixes: 6109c5a6ab7f ("KVM: arm64: Move vGIC v4 handling for WFI out arch callback hook") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220217101242.3013716-1-oupton@google.com --- arch/arm64/kvm/psci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 3eae32876897..2ce60fecd861 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -46,8 +46,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) * specification (ARM DEN 0022A). This means all suspend states * for KVM will preserve the register state. */ - kvm_vcpu_halt(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); + kvm_vcpu_wfi(vcpu); return PSCI_RET_SUCCESS; } -- cgit From 64324ef337d0caa5798fa8fa3f6bbfbd3245868a Mon Sep 17 00:00:00 2001 From: Anthoine Bourgeois Date: Tue, 25 Jan 2022 20:11:38 +0100 Subject: ARM: dts: switch timer config to common devkit8000 devicetree This patch allow lcd43 and lcd70 flavors to benefit from timer evolution. Fixes: e428e250fde6 ("ARM: dts: Configure system timers for omap3") Signed-off-by: Anthoine Bourgeois Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3-devkit8000-common.dtsi | 33 ++++++++++++++++++++++++++ arch/arm/boot/dts/omap3-devkit8000.dts | 33 -------------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi index 5e55198e4576..f5197bb31ed8 100644 --- a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi +++ b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi @@ -158,6 +158,39 @@ status = "disabled"; }; +/* Unusable as clocksource because of unreliable oscillator */ +&counter32k { + status = "disabled"; +}; + +/* Unusable as clockevent because if unreliable oscillator, allow to idle */ +&timer1_target { + /delete-property/ti,no-reset-on-init; + /delete-property/ti,no-idle; + timer@0 { + /delete-property/ti,timer-alwon; + }; +}; + +/* Preferred always-on timer for clocksource */ +&timer12_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + /* Always clocked by secure_32k_fck */ + }; +}; + +/* Preferred timer for clockevent */ +&timer2_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + assigned-clocks = <&gpt2_fck>; + assigned-clock-parents = <&sys_ck>; + }; +}; + &twl_gpio { ti,use-leds; /* diff --git a/arch/arm/boot/dts/omap3-devkit8000.dts b/arch/arm/boot/dts/omap3-devkit8000.dts index c2995a280729..162d0726b008 100644 --- a/arch/arm/boot/dts/omap3-devkit8000.dts +++ b/arch/arm/boot/dts/omap3-devkit8000.dts @@ -14,36 +14,3 @@ display2 = &tv0; }; }; - -/* Unusable as clocksource because of unreliable oscillator */ -&counter32k { - status = "disabled"; -}; - -/* Unusable as clockevent because if unreliable oscillator, allow to idle */ -&timer1_target { - /delete-property/ti,no-reset-on-init; - /delete-property/ti,no-idle; - timer@0 { - /delete-property/ti,timer-alwon; - }; -}; - -/* Preferred always-on timer for clocksource */ -&timer12_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - /* Always clocked by secure_32k_fck */ - }; -}; - -/* Preferred timer for clockevent */ -&timer2_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - assigned-clocks = <&gpt2_fck>; - assigned-clock-parents = <&sys_ck>; - }; -}; -- cgit From 8840f5460a23759403f1f2860429dcbcc2f04a65 Mon Sep 17 00:00:00 2001 From: Anthoine Bourgeois Date: Tue, 25 Jan 2022 20:11:39 +0100 Subject: ARM: dts: Use 32KiHz oscillator on devkit8000 Devkit8000 board seems to always used 32k_counter as clocksource. Restore this behavior. If clocksource is back to 32k_counter, timer12 is now the clockevent source (as before) and timer2 is not longer needed here. This commit fixes the same issue observed with commit 23885389dbbb ("ARM: dts: Fix timer regression for beagleboard revision c") when sleep is blocked until hitting keys over serial console. Fixes: aba1ad05da08 ("clocksource/drivers/timer-ti-dm: Add clockevent and clocksource support") Fixes: e428e250fde6 ("ARM: dts: Configure system timers for omap3") Signed-off-by: Anthoine Bourgeois Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3-devkit8000-common.dtsi | 17 +---------------- drivers/clocksource/timer-ti-dm-systimer.c | 3 +-- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi index f5197bb31ed8..54cd37336be7 100644 --- a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi +++ b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi @@ -158,11 +158,6 @@ status = "disabled"; }; -/* Unusable as clocksource because of unreliable oscillator */ -&counter32k { - status = "disabled"; -}; - /* Unusable as clockevent because if unreliable oscillator, allow to idle */ &timer1_target { /delete-property/ti,no-reset-on-init; @@ -172,7 +167,7 @@ }; }; -/* Preferred always-on timer for clocksource */ +/* Preferred timer for clockevent */ &timer12_target { ti,no-reset-on-init; ti,no-idle; @@ -181,16 +176,6 @@ }; }; -/* Preferred timer for clockevent */ -&timer2_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - assigned-clocks = <&gpt2_fck>; - assigned-clock-parents = <&sys_ck>; - }; -}; - &twl_gpio { ti,use-leds; /* diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c index 5c40ca1d4740..1fccb457fcc5 100644 --- a/drivers/clocksource/timer-ti-dm-systimer.c +++ b/drivers/clocksource/timer-ti-dm-systimer.c @@ -241,8 +241,7 @@ static void __init dmtimer_systimer_assign_alwon(void) bool quirk_unreliable_oscillator = false; /* Quirk unreliable 32 KiHz oscillator with incomplete dts */ - if (of_machine_is_compatible("ti,omap3-beagle-ab4") || - of_machine_is_compatible("timll,omap3-devkit8000")) { + if (of_machine_is_compatible("ti,omap3-beagle-ab4")) { quirk_unreliable_oscillator = true; counter_32k = -ENODEV; } -- cgit From a679a61520d8a7b0211a1da990404daf5cc80b72 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 18 Feb 2022 11:47:51 +0100 Subject: fuse: fix fileattr op failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fileattr API conversion broke lsattr on ntfs3g. Previously the ioctl(... FS_IOC_GETFLAGS) returned an EINVAL error, but after the conversion the error returned by the fuse filesystem was not propagated back to the ioctl() system call, resulting in success being returned with bogus values. Fix by checking for outarg.result in fuse_priv_ioctl(), just as generic ioctl code does. Reported-by: Jean-Pierre André Fixes: 72227eac177d ("fuse: convert to fileattr") Cc: # v5.13 Signed-off-by: Miklos Szeredi --- fs/fuse/ioctl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fbc09dab1f85..df58966bc874 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].value = ptr; err = fuse_simple_request(fm, &args); - if (!err && outarg.flags & FUSE_IOCTL_RETRY) - err = -EIO; - + if (!err) { + if (outarg.result < 0) + err = outarg.result; + else if (outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + } return err; } -- cgit From efd12405f1801ef0458d908a844317fb1388c3bf Mon Sep 17 00:00:00 2001 From: Li Yang Date: Mon, 8 Nov 2021 18:07:51 -0600 Subject: dt-bindings: qoriq-clock: add missing compatible for lx2160a The compatible string is already in use, fix the binding to include it. Signed-off-by: Li Yang Acked-by: Rob Herring --- Documentation/devicetree/bindings/clock/qoriq-clock.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/clock/qoriq-clock.txt b/Documentation/devicetree/bindings/clock/qoriq-clock.txt index f7d48f23da44..10119d9ef4b1 100644 --- a/Documentation/devicetree/bindings/clock/qoriq-clock.txt +++ b/Documentation/devicetree/bindings/clock/qoriq-clock.txt @@ -44,6 +44,7 @@ Required properties: * "fsl,ls1046a-clockgen" * "fsl,ls1088a-clockgen" * "fsl,ls2080a-clockgen" + * "fsl,lx2160a-clockgen" Chassis-version clock strings include: * "fsl,qoriq-clockgen-1.0": for chassis 1.0 clocks * "fsl,qoriq-clockgen-2.0": for chassis 2.0 clocks -- cgit From 6b4266b8deb857ce2dc2a9b769b242865b9a0bce Mon Sep 17 00:00:00 2001 From: Li Yang Date: Mon, 8 Nov 2021 18:10:18 -0600 Subject: dt-bindings: fsl,layerscape-dcfg: add missing compatible for lx2160a The compatible string is already in use, fix the chip list in binding to include it. Signed-off-by: Li Yang Acked-by: Rob Herring --- Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt index b5cb374dc47d..10a91cc8b997 100644 --- a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt +++ b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt @@ -8,7 +8,7 @@ Required properties: - compatible: Should contain a chip-specific compatible string, Chip-specific strings are of the form "fsl,-dcfg", The following s are known to be supported: - ls1012a, ls1021a, ls1043a, ls1046a, ls2080a. + ls1012a, ls1021a, ls1043a, ls1046a, ls2080a, lx2160a - reg : should contain base address and length of DCFG memory-mapped registers -- cgit From 988f0a9045b0058a43ccee764a671dfab81e6d15 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:59:52 +0200 Subject: soc: fsl: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Signed-off-by: Andy Shevchenko Signed-off-by: Li Yang --- include/soc/fsl/dpaa2-fd.h | 3 ++- include/soc/fsl/qe/immap_qe.h | 3 ++- include/soc/fsl/qe/qe_tdm.h | 4 +++- include/soc/fsl/qe/ucc_fast.h | 2 +- include/soc/fsl/qe/ucc_slow.h | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/soc/fsl/dpaa2-fd.h b/include/soc/fsl/dpaa2-fd.h index 90ae8d191f1a..bae490cac0aa 100644 --- a/include/soc/fsl/dpaa2-fd.h +++ b/include/soc/fsl/dpaa2-fd.h @@ -7,7 +7,8 @@ #ifndef __FSL_DPAA2_FD_H #define __FSL_DPAA2_FD_H -#include +#include +#include /** * DOC: DPAA2 FD - Frame Descriptor APIs for DPAA2 diff --git a/include/soc/fsl/qe/immap_qe.h b/include/soc/fsl/qe/immap_qe.h index 7614fee532f1..edd601f53f5d 100644 --- a/include/soc/fsl/qe/immap_qe.h +++ b/include/soc/fsl/qe/immap_qe.h @@ -13,7 +13,8 @@ #define _ASM_POWERPC_IMMAP_QE_H #ifdef __KERNEL__ -#include +#include + #include #define QE_IMMAP_SIZE (1024 * 1024) /* 1MB from 1MB+IMMR */ diff --git a/include/soc/fsl/qe/qe_tdm.h b/include/soc/fsl/qe/qe_tdm.h index b6febe225071..43ea830cfe1f 100644 --- a/include/soc/fsl/qe/qe_tdm.h +++ b/include/soc/fsl/qe/qe_tdm.h @@ -10,8 +10,8 @@ #ifndef _QE_TDM_H_ #define _QE_TDM_H_ -#include #include +#include #include #include @@ -19,6 +19,8 @@ #include #include +struct device_node; + /* SI RAM entries */ #define SIR_LAST 0x0001 #define SIR_BYTE 0x0002 diff --git a/include/soc/fsl/qe/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h index 9696a5b9b5d1..ad60b87a3c69 100644 --- a/include/soc/fsl/qe/ucc_fast.h +++ b/include/soc/fsl/qe/ucc_fast.h @@ -10,7 +10,7 @@ #ifndef __UCC_FAST_H__ #define __UCC_FAST_H__ -#include +#include #include #include diff --git a/include/soc/fsl/qe/ucc_slow.h b/include/soc/fsl/qe/ucc_slow.h index 11a216e4e919..7548ce8a202d 100644 --- a/include/soc/fsl/qe/ucc_slow.h +++ b/include/soc/fsl/qe/ucc_slow.h @@ -11,7 +11,7 @@ #ifndef __UCC_SLOW_H__ #define __UCC_SLOW_H__ -#include +#include #include #include -- cgit From f2b70418ec6f104981b54709a4cfe3a3c46b7d8f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:59:50 +0200 Subject: soc: fsl: Correct MAINTAINERS database (QUICC ENGINE LIBRARY) MAINTAINERS lacks of proper coverage for FSL headers. Fix it accordingly. Fixes: 7aa1aa6ecec2 ("QE: Move QE from arch/powerpc to drivers/soc") Signed-off-by: Andy Shevchenko Signed-off-by: Li Yang --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..9abac76757ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7736,8 +7736,7 @@ M: Qiang Zhao L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/soc/fsl/qe/ -F: include/soc/fsl/*qe*.h -F: include/soc/fsl/*ucc*.h +F: include/soc/fsl/qe/ FREESCALE QUICC ENGINE UCC ETHERNET DRIVER M: Li Yang -- cgit From b80af7564446c8ab96438cac00e0575eb86154ad Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:59:51 +0200 Subject: soc: fsl: Correct MAINTAINERS database (SOC) MAINTAINERS lacks of proper coverage for FSL headers. Fix it accordingly. Fixes: 1b48706f027c ("MAINTAINERS: add entry for Freescale SoC drivers") Signed-off-by: Andy Shevchenko Signed-off-by: Li Yang --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9abac76757ad..1ab143aab365 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7767,6 +7767,7 @@ F: Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ F: include/linux/fsl/ +F: include/soc/fsl/ FREESCALE SOC FS_ENET DRIVER M: Pantelis Antoniou -- cgit From b113737cf12964a20cc3ba1ddabe6229099661c6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 3 Nov 2021 21:00:17 +0100 Subject: soc: fsl: guts: Revert commit 3c0d64e867ed This reverts commit 3c0d64e867ed ("soc: fsl: guts: reuse machine name from device tree"). A following patch will fix the missing memory allocation failure check instead. Suggested-by: Tyrel Datwyler Signed-off-by: Christophe JAILLET Signed-off-by: Li Yang --- drivers/soc/fsl/guts.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/soc/fsl/guts.c b/drivers/soc/fsl/guts.c index 072473a16f4d..af7741eafc57 100644 --- a/drivers/soc/fsl/guts.c +++ b/drivers/soc/fsl/guts.c @@ -28,7 +28,6 @@ struct fsl_soc_die_attr { static struct guts *guts; static struct soc_device_attribute soc_dev_attr; static struct soc_device *soc_dev; -static struct device_node *root; /* SoC die attribute definition for QorIQ platform */ @@ -138,7 +137,7 @@ static u32 fsl_guts_get_svr(void) static int fsl_guts_probe(struct platform_device *pdev) { - struct device_node *np = pdev->dev.of_node; + struct device_node *root, *np = pdev->dev.of_node; struct device *dev = &pdev->dev; const struct fsl_soc_die_attr *soc_die; const char *machine; @@ -159,8 +158,9 @@ static int fsl_guts_probe(struct platform_device *pdev) root = of_find_node_by_path("/"); if (of_property_read_string(root, "model", &machine)) of_property_read_string_index(root, "compatible", 0, &machine); + of_node_put(root); if (machine) - soc_dev_attr.machine = machine; + soc_dev_attr.machine = devm_kstrdup(dev, machine, GFP_KERNEL); svr = fsl_guts_get_svr(); soc_die = fsl_soc_die_match(svr, fsl_soc_die); @@ -195,7 +195,6 @@ static int fsl_guts_probe(struct platform_device *pdev) static int fsl_guts_remove(struct platform_device *dev) { soc_device_unregister(soc_dev); - of_node_put(root); return 0; } -- cgit From b9abe942cda43a1d46a0fd96efb54f1aa909f757 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 3 Nov 2021 21:00:33 +0100 Subject: soc: fsl: guts: Add a missing memory allocation failure check If 'devm_kstrdup()' fails, we should return -ENOMEM. While at it, move the 'of_node_put()' call in the error handling path and after the 'machine' has been copied. Better safe than sorry. Fixes: a6fc3b698130 ("soc: fsl: add GUTS driver for QorIQ platforms") Depends-on: fddacc7ff4dd ("soc: fsl: guts: Revert commit 3c0d64e867ed") Suggested-by: Tyrel Datwyler Signed-off-by: Christophe JAILLET Signed-off-by: Li Yang --- drivers/soc/fsl/guts.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/soc/fsl/guts.c b/drivers/soc/fsl/guts.c index af7741eafc57..5ed2fc1c53a0 100644 --- a/drivers/soc/fsl/guts.c +++ b/drivers/soc/fsl/guts.c @@ -158,9 +158,14 @@ static int fsl_guts_probe(struct platform_device *pdev) root = of_find_node_by_path("/"); if (of_property_read_string(root, "model", &machine)) of_property_read_string_index(root, "compatible", 0, &machine); - of_node_put(root); - if (machine) + if (machine) { soc_dev_attr.machine = devm_kstrdup(dev, machine, GFP_KERNEL); + if (!soc_dev_attr.machine) { + of_node_put(root); + return -ENOMEM; + } + } + of_node_put(root); svr = fsl_guts_get_svr(); soc_die = fsl_soc_die_match(svr, fsl_soc_die); -- cgit From 6385960501d9e0248a8745714674e86bd077e198 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Sat, 11 Dec 2021 17:08:45 +0800 Subject: soc: fsl: qe: fix typo in a comment The double `is' in the comment in line 150 is repeated. Remove one of them from the comment. Also removes a redundant tab in a new line. Signed-off-by: Jason Wang Signed-off-by: Li Yang --- drivers/soc/fsl/qe/qe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c index 4d38c80f8be8..b3c226eb5292 100644 --- a/drivers/soc/fsl/qe/qe.c +++ b/drivers/soc/fsl/qe/qe.c @@ -147,7 +147,7 @@ EXPORT_SYMBOL(qe_issue_cmd); * memory mapped space. * The BRG clock is the QE clock divided by 2. * It was set up long ago during the initial boot phase and is - * is given to us. + * given to us. * Baud rate clocks are zero-based in the driver code (as that maps * to port numbers). Documentation uses 1-based numbering. */ @@ -421,7 +421,7 @@ static void qe_upload_microcode(const void *base, for (i = 0; i < be32_to_cpu(ucode->count); i++) iowrite32be(be32_to_cpu(code[i]), &qe_immr->iram.idata); - + /* Set I-RAM Ready Register */ iowrite32be(QE_IRAM_READY, &qe_immr->iram.iready); } -- cgit From a222fd8541394b36b13c89d1698d9530afd59a9c Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 30 Dec 2021 09:45:43 +0800 Subject: soc: fsl: qe: Check of ioremap return value As the possible failure of the ioremap(), the par_io could be NULL. Therefore it should be better to check it and return error in order to guarantee the success of the initiation. But, I also notice that all the caller like mpc85xx_qe_par_io_init() in `arch/powerpc/platforms/85xx/common.c` don't check the return value of the par_io_init(). Actually, par_io_init() needs to check to handle the potential error. I will submit another patch to fix that. Anyway, par_io_init() itsely should be fixed. Fixes: 7aa1aa6ecec2 ("QE: Move QE from arch/powerpc to drivers/soc") Signed-off-by: Jiasheng Jiang Signed-off-by: Li Yang --- drivers/soc/fsl/qe/qe_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/soc/fsl/qe/qe_io.c b/drivers/soc/fsl/qe/qe_io.c index e277c827bdf3..a5e2d0e5ab51 100644 --- a/drivers/soc/fsl/qe/qe_io.c +++ b/drivers/soc/fsl/qe/qe_io.c @@ -35,6 +35,8 @@ int par_io_init(struct device_node *np) if (ret) return ret; par_io = ioremap(res.start, resource_size(&res)); + if (!par_io) + return -ENOMEM; if (!of_property_read_u32(np, "num-ports", &num_ports)) num_par_io_ports = num_ports; -- cgit From fa231bef3b34f1670b240409c11e59a3ce095e6d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 18 Feb 2022 23:57:20 +0200 Subject: soc: imx: gpcv2: Fix clock disabling imbalance in error path The imx_pgc_power_down() starts by enabling the domain clocks, and thus disables them in the error path. Commit 18c98573a4cf ("soc: imx: gpcv2: add domain option to keep domain clocks enabled") made the clock enable conditional, but forgot to add the same condition to the error path. This can result in a clock enable/disable imbalance. Fix it. Fixes: 18c98573a4cf ("soc: imx: gpcv2: add domain option to keep domain clocks enabled") Signed-off-by: Laurent Pinchart Reviewed-by: Lucas Stach Signed-off-by: Shawn Guo --- drivers/soc/imx/gpcv2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/imx/gpcv2.c b/drivers/soc/imx/gpcv2.c index 3e59d479d001..3cb123016b3e 100644 --- a/drivers/soc/imx/gpcv2.c +++ b/drivers/soc/imx/gpcv2.c @@ -382,7 +382,8 @@ static int imx_pgc_power_down(struct generic_pm_domain *genpd) return 0; out_clk_disable: - clk_bulk_disable_unprepare(domain->num_clks, domain->clks); + if (!domain->keep_clocks) + clk_bulk_disable_unprepare(domain->num_clks, domain->clks); return ret; } -- cgit From fc3ef2e3297b3c0e2006b5d7b3d66965e3392036 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 20 Feb 2022 19:01:14 +0300 Subject: HID: hid-thrustmaster: fix OOB read in thrustmaster_interrupts Syzbot reported an slab-out-of-bounds Read in thrustmaster_probe() bug. The root case is in missing validation check of actual number of endpoints. Code should not blindly access usb_host_interface::endpoint array, since it may contain less endpoints than code expects. Fix it by adding missing validaion check and print an error if number of endpoints do not match expected number Fixes: c49c33637802 ("HID: support for initialization of some Thrustmaster wheels") Reported-and-tested-by: syzbot+35eebd505e97d315d01c@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Jiri Kosina --- drivers/hid/hid-thrustmaster.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index a4e20f9e598b..c3e6d69fdfbd 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -160,6 +160,12 @@ static void thrustmaster_interrupts(struct hid_device *hdev) return; } + if (usbif->cur_altsetting->desc.bNumEndpoints < 2) { + kfree(send_buf); + hid_err(hdev, "Wrong number of endpoints?\n"); + return; + } + ep = &usbif->cur_altsetting->endpoint[1]; b_ep = ep->desc.bEndpointAddress; -- cgit From d45476d9832409371537013ebdd8dc1a7781f97a Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 16 Feb 2022 20:57:00 +0100 Subject: x86/speculation: Rename RETPOLINE_AMD to RETPOLINE_LFENCE The RETPOLINE_AMD name is unfortunate since it isn't necessarily AMD only, in fact Hygon also uses it. Furthermore it will likely be sufficient for some Intel processors. Therefore rename the thing to RETPOLINE_LFENCE to better describe what it is. Add the spectre_v2=retpoline,lfence option as an alias to spectre_v2=retpoline,amd to preserve existing setups. However, the output of /sys/devices/system/cpu/vulnerabilities/spectre_v2 will be changed. [ bp: Fix typos, massage. ] Co-developed-by: Josh Poimboeuf Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/nospec-branch.h | 12 ++++++------ arch/x86/kernel/alternative.c | 8 ++++---- arch/x86/kernel/cpu/bugs.c | 29 ++++++++++++++++++----------- arch/x86/lib/retpoline.S | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 7 files changed, 32 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6db4e2932b3d..65d147974f8d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -204,7 +204,7 @@ /* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index cc74dc584836..c91d97bee237 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -84,7 +84,7 @@ #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE #else jmp *%\reg #endif @@ -94,7 +94,7 @@ #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE #else call *%\reg #endif @@ -146,7 +146,7 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) @@ -176,7 +176,7 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -188,8 +188,8 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, - SPECTRE_V2_RETPOLINE_GENERIC, - SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_RETPOLINE, + SPECTRE_V2_LFENCE, SPECTRE_V2_IBRS_ENHANCED, }; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5007c3ffe96f..b4470eabf151 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -389,7 +389,7 @@ static int emit_indirect(int op, int reg, u8 *bytes) * * CALL *%\reg * - * It also tries to inline spectre_v2=retpoline,amd when size permits. + * It also tries to inline spectre_v2=retpoline,lfence when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { @@ -407,7 +407,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) return -1; op = insn->opcode.bytes[0]; @@ -438,9 +438,9 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) } /* - * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { bytes[i++] = 0x0f; bytes[i++] = 0xae; bytes[i++] = 0xe8; /* LFENCE */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1c1f218a701d..e3bb8afc6768 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -664,7 +664,7 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, - SPECTRE_V2_CMD_RETPOLINE_AMD, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, }; enum spectre_v2_user_cmd { @@ -824,8 +824,8 @@ set_mode: static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", }; @@ -837,7 +837,8 @@ static const struct { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, }; @@ -875,13 +876,19 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && !IS_ENABLED(CONFIG_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -916,9 +923,9 @@ static void __init spectre_v2_select_mitigation(void) if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; - case SPECTRE_V2_CMD_RETPOLINE_AMD: + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_amd; + goto retpoline_lfence; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: if (IS_ENABLED(CONFIG_RETPOLINE)) @@ -935,17 +942,17 @@ static void __init spectre_v2_select_mitigation(void) retpoline_auto: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_amd: + retpoline_lfence: if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); goto retpoline_generic; } - mode = SPECTRE_V2_RETPOLINE_AMD; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + mode = SPECTRE_V2_LFENCE; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } else { retpoline_generic: - mode = SPECTRE_V2_RETPOLINE_GENERIC; + mode = SPECTRE_V2_RETPOLINE; setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 89b3fb244e15..afbdda539b80 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE .endm diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2b1e266ff95c..0ecb140864b2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -394,7 +394,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) u8 *prog = *pprog; #ifdef CONFIG_RETPOLINE - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 6db4e2932b3d..ab4e53715d8a 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -204,7 +204,7 @@ /* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCEs for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -- cgit From 1e19da8522c81bf46b335f84137165741e0d82b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2022 20:57:01 +0100 Subject: x86/speculation: Add eIBRS + Retpoline options Thanks to the chaps at VUsec it is now clear that eIBRS is not sufficient, therefore allow enabling of retpolines along with eIBRS. Add spectre_v2=eibrs, spectre_v2=eibrs,lfence and spectre_v2=eibrs,retpoline options to explicitly pick your preferred means of mitigation. Since there's new mitigations there's also user visible changes in /sys/devices/system/cpu/vulnerabilities/spectre_v2 to reflect these new mitigations. [ bp: Massage commit message, trim error messages, do more precise eIBRS mode checking. ] Co-developed-by: Josh Poimboeuf Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Patrick Colp Reviewed-by: Thomas Gleixner --- arch/x86/include/asm/nospec-branch.h | 4 +- arch/x86/kernel/cpu/bugs.c | 133 +++++++++++++++++++++++++---------- 2 files changed, 99 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c91d97bee237..acbaeaf83b61 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -190,7 +190,9 @@ enum spectre_v2_mitigation { SPECTRE_V2_NONE, SPECTRE_V2_RETPOLINE, SPECTRE_V2_LFENCE, - SPECTRE_V2_IBRS_ENHANCED, + SPECTRE_V2_EIBRS, + SPECTRE_V2_EIBRS_RETPOLINE, + SPECTRE_V2_EIBRS_LFENCE, }; /* The indirect branch speculation control variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e3bb8afc6768..79c52dd6c597 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -665,6 +665,9 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, }; enum spectre_v2_user_cmd { @@ -737,6 +740,13 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return (mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE); +} + static void __init spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) { @@ -804,7 +814,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return; /* @@ -826,7 +836,9 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", }; static const struct { @@ -840,6 +852,9 @@ static const struct { { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, }; @@ -877,15 +892,29 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) if ((cmd == SPECTRE_V2_CMD_RETPOLINE || cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE) && + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -894,6 +923,25 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing, switching to generic retpoline\n"); + return SPECTRE_V2_RETPOLINE; + } + return SPECTRE_V2_LFENCE; + } + + return SPECTRE_V2_RETPOLINE; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -914,49 +962,60 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_IBRS_ENHANCED; - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - goto specv2_set_mode; + mode = SPECTRE_V2_EIBRS; + break; } - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + + mode = spectre_v2_select_retpoline(); break; + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_lfence; + mode = SPECTRE_V2_LFENCE; break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_generic; + mode = SPECTRE_V2_RETPOLINE; break; + case SPECTRE_V2_CMD_RETPOLINE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } - pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); - return; -retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_lfence: - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); - goto retpoline_generic; - } - mode = SPECTRE_V2_LFENCE; + if (spectre_v2_in_eibrs_mode(mode)) { + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + fallthrough; + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); - } else { - retpoline_generic: - mode = SPECTRE_V2_RETPOLINE; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; } -specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -982,7 +1041,7 @@ specv2_set_mode: * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } @@ -1691,7 +1750,7 @@ static ssize_t tsx_async_abort_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { -- cgit From 5ad3eb1132453b9795ce5fd4572b1c18b292cca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2022 20:57:02 +0100 Subject: Documentation/hw-vuln: Update spectre doc Update the doc with the new fun. [ bp: Massage commit message. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- Documentation/admin-guide/hw-vuln/spectre.rst | 44 +++++++++++++++++-------- Documentation/admin-guide/kernel-parameters.txt | 8 +++-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index a2b22d5640ec..79051f4dac45 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -131,6 +131,19 @@ steer its indirect branch speculations to gadget code, and measure the speculative execution's side effects left in level 1 cache to infer the victim's data. +Yet another variant 2 attack vector is for the attacker to poison the +Branch History Buffer (BHB) to speculatively steer an indirect branch +to a specific Branch Target Buffer (BTB) entry, even if the entry isn't +associated with the source address of the indirect branch. Specifically, +the BHB might be shared across privilege levels even in the presence of +Enhanced IBRS. + +Currently the only known real-world BHB attack vector is via +unprivileged eBPF. Therefore, it's highly recommended to not enable +unprivileged eBPF, especially when eIBRS is used (without retpolines). +For a full mitigation against BHB attacks, it's recommended to use +retpolines (or eIBRS combined with retpolines). + Attack scenarios ---------------- @@ -364,13 +377,15 @@ The possible values in this file are: - Kernel status: - ==================================== ================================= - 'Not affected' The processor is not vulnerable - 'Vulnerable' Vulnerable, no mitigation - 'Mitigation: Full generic retpoline' Software-focused mitigation - 'Mitigation: Full AMD retpoline' AMD-specific software mitigation - 'Mitigation: Enhanced IBRS' Hardware-focused mitigation - ==================================== ================================= + ======================================== ================================= + 'Not affected' The processor is not vulnerable + 'Mitigation: None' Vulnerable, no mitigation + 'Mitigation: Retpolines' Use Retpoline thunks + 'Mitigation: LFENCE' Use LFENCE instructions + 'Mitigation: Enhanced IBRS' Hardware-focused mitigation + 'Mitigation: Enhanced IBRS + Retpolines' Hardware-focused + Retpolines + 'Mitigation: Enhanced IBRS + LFENCE' Hardware-focused + LFENCE + ======================================== ================================= - Firmware status: Show if Indirect Branch Restricted Speculation (IBRS) is used to protect against Spectre variant 2 attacks when calling firmware (x86 only). @@ -583,12 +598,13 @@ kernel command line. Specific mitigations can also be selected manually: - retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline auto pick between generic,lfence + retpoline,generic Retpolines + retpoline,lfence LFENCE; indirect branch + retpoline,amd alias for retpoline,lfence + eibrs enhanced IBRS + eibrs,retpoline enhanced IBRS + Retpolines + eibrs,lfence enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. @@ -599,7 +615,7 @@ kernel command line. spectre_v2=off. Spectre variant 1 mitigations cannot be disabled. -For spectre_v2_user see :doc:`/admin-guide/kernel-parameters`. +For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt Mitigation selection guide -------------------------- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f5a27f067db9..7123524a86b8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5361,8 +5361,12 @@ Specific mitigations can also be selected manually: retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline,generic - Retpolines + retpoline,lfence - LFENCE; indirect branch + retpoline,amd - alias for retpoline,lfence + eibrs - enhanced IBRS + eibrs,retpoline - enhanced IBRS + Retpolines + eibrs,lfence - enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. -- cgit From 44a3918c8245ab10c6c9719dd12e7a8d291980d8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Feb 2022 11:49:08 -0800 Subject: x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++++++++------ include/linux/bpf.h | 11 +++++++++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 79c52dd6c597..0a4267c63d3b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -650,6 +651,16 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -994,6 +1005,9 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + if (spectre_v2_in_eibrs_mode(mode)) { /* Force it so VMEXIT will restore correctly */ x86_spec_ctrl_base |= SPEC_CTRL_IBRS; @@ -1780,6 +1794,20 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1805,12 +1833,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..1f56806d8eb9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1793,6 +1793,11 @@ struct bpf_core_ctx { int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2012,6 +2017,12 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { return NULL; } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e..730ab56d9e92 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -180,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -197,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; *(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ -- cgit From c086df4902573e2f06c6a2a83452c13a8bc603f5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:52:52 -0500 Subject: fuse: move FUSE_SUPER_MAGIC definition to magic.h ...to help userland apps that need to identify FUSE mounts. Signed-off-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 3 +-- include/uapi/linux/magic.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ee846ce371d8..9ee36aa73251 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); -#define FUSE_SUPER_MAGIC 0x65735546 - #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 0425cd79af9a..f724129c0425 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -36,6 +36,7 @@ #define EFIVARFS_MAGIC 0xde5e81e4 #define HOSTFS_SUPER_MAGIC 0x00c0ffee #define OVERLAYFS_SUPER_MAGIC 0x794c7630 +#define FUSE_SUPER_MAGIC 0x65735546 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ -- cgit From d920eaa4c4559f59be7b4c2d26fa0a2e1aaa3da9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 16 Feb 2022 15:37:38 +0000 Subject: ARM: Fix kgdb breakpoint for Thumb2 The kgdb code needs to register an undef hook for the Thumb UDF instruction that will fault in order to be functional on Thumb2 platforms. Reported-by: Johannes Stezenbach Tested-by: Johannes Stezenbach Fixes: 5cbad0ebf45c ("kgdb: support for ARCH=arm") Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/kgdb.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index 7bd30c0a4280..22f937e6f3ff 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -154,22 +154,38 @@ static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr) return 0; } -static struct undef_hook kgdb_brkpt_hook = { +static struct undef_hook kgdb_brkpt_arm_hook = { .instr_mask = 0xffffffff, .instr_val = KGDB_BREAKINST, - .cpsr_mask = MODE_MASK, + .cpsr_mask = PSR_T_BIT | MODE_MASK, .cpsr_val = SVC_MODE, .fn = kgdb_brk_fn }; -static struct undef_hook kgdb_compiled_brkpt_hook = { +static struct undef_hook kgdb_brkpt_thumb_hook = { + .instr_mask = 0xffff, + .instr_val = KGDB_BREAKINST & 0xffff, + .cpsr_mask = PSR_T_BIT | MODE_MASK, + .cpsr_val = PSR_T_BIT | SVC_MODE, + .fn = kgdb_brk_fn +}; + +static struct undef_hook kgdb_compiled_brkpt_arm_hook = { .instr_mask = 0xffffffff, .instr_val = KGDB_COMPILED_BREAK, - .cpsr_mask = MODE_MASK, + .cpsr_mask = PSR_T_BIT | MODE_MASK, .cpsr_val = SVC_MODE, .fn = kgdb_compiled_brk_fn }; +static struct undef_hook kgdb_compiled_brkpt_thumb_hook = { + .instr_mask = 0xffff, + .instr_val = KGDB_COMPILED_BREAK & 0xffff, + .cpsr_mask = PSR_T_BIT | MODE_MASK, + .cpsr_val = PSR_T_BIT | SVC_MODE, + .fn = kgdb_compiled_brk_fn +}; + static int __kgdb_notify(struct die_args *args, unsigned long cmd) { struct pt_regs *regs = args->regs; @@ -210,8 +226,10 @@ int kgdb_arch_init(void) if (ret != 0) return ret; - register_undef_hook(&kgdb_brkpt_hook); - register_undef_hook(&kgdb_compiled_brkpt_hook); + register_undef_hook(&kgdb_brkpt_arm_hook); + register_undef_hook(&kgdb_brkpt_thumb_hook); + register_undef_hook(&kgdb_compiled_brkpt_arm_hook); + register_undef_hook(&kgdb_compiled_brkpt_thumb_hook); return 0; } @@ -224,8 +242,10 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_undef_hook(&kgdb_brkpt_hook); - unregister_undef_hook(&kgdb_compiled_brkpt_hook); + unregister_undef_hook(&kgdb_brkpt_arm_hook); + unregister_undef_hook(&kgdb_brkpt_thumb_hook); + unregister_undef_hook(&kgdb_compiled_brkpt_arm_hook); + unregister_undef_hook(&kgdb_compiled_brkpt_thumb_hook); unregister_die_notifier(&kgdb_notifier); } -- cgit From 11c57c3ba94da74c3446924260e34e0b1950b5d7 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 17 Jan 2022 05:09:40 +0100 Subject: ARM: 9178/1: fix unmet dependency on BITREVERSE for HAVE_ARCH_BITREVERSE Resending this to properly add it to the patch tracker - thanks for letting me know, Arnd :) When ARM is enabled, and BITREVERSE is disabled, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for HAVE_ARCH_BITREVERSE Depends on [n]: BITREVERSE [=n] Selected by [y]: - ARM [=y] && (CPU_32v7M [=n] || CPU_32v7 [=y]) && !CPU_32v6 [=n] This is because ARM selects HAVE_ARCH_BITREVERSE without selecting BITREVERSE, despite HAVE_ARCH_BITREVERSE depending on BITREVERSE. This unmet dependency bug was found by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. Signed-off-by: Julian Braha Signed-off-by: Russell King (Oracle) --- lib/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig b/lib/Kconfig index 5e7165e6a346..fa4b10322efc 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -45,7 +45,6 @@ config BITREVERSE config HAVE_ARCH_BITREVERSE bool default n - depends on BITREVERSE help This option enables the use of hardware bit-reversal instructions on architectures which support such operations. -- cgit From 1e6ae0e46e32749b130f1823da30cea9aa2a59a0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 21 Feb 2022 09:50:29 -0800 Subject: mips: setup: fix setnocoherentio() boolean setting Correct a typo/pasto: setnocoherentio() should set dma_default_coherent to false, not true. Fixes: 14ac09a65e19 ("MIPS: refactor the runtime coherent vs noncoherent DMA indicators") Signed-off-by: Randy Dunlap Cc: Christoph Hellwig Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index f979adfd4fc2..ef73ba1e0ec1 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -803,7 +803,7 @@ early_param("coherentio", setcoherentio); static int __init setnocoherentio(char *str) { - dma_default_coherent = true; + dma_default_coherent = false; pr_info("Software DMA cache coherency (command line)\n"); return 0; } -- cgit From ae089831ff28a115908b8d796f667c2dadef1637 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Feb 2022 10:13:31 -0800 Subject: netfilter: nf_tables: prefer kfree_rcu(ptr, rcu) variant While kfree_rcu(ptr) _is_ supported, it has some limitations. Given that 99.99% of kfree_rcu() users [1] use the legacy two parameters variant, and @catchall objects do have an rcu head, simply use it. Choice of kfree_rcu(ptr) variant was probably not intentional. [1] including calls from net/netfilter/nf_tables_api.c Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Eric Dumazet Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9cd1d7a62804..c86748b3873b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4502,7 +4502,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); nft_set_elem_destroy(set, catchall->elem, true); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); } } @@ -5669,7 +5669,7 @@ static void nft_setelem_catchall_remove(const struct net *net, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem->priv) { list_del_rcu(&catchall->list); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); break; } } -- cgit From 8d3b01e0d4bb54368d73d0984466d72c2eeeac74 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 20 Dec 2021 11:32:39 +0100 Subject: ARM: tegra: Move panels to AUX bus Move the eDP panel on Venice 2 and Nyan boards into the corresponding AUX bus device tree node. This allows us to avoid a nasty circular dependency that would otherwise be created between the DPAUX and panel nodes via the DDC/I2C phandle. Fixes: eb481f9ac95c ("ARM: tegra: add Acer Chromebook 13 device tree") Fixes: 59fe02cb079f ("ARM: tegra: Add DTS for the nyan-blaze board") Fixes: 40e231c770a4 ("ARM: tegra: Enable eDP for Venice2") Signed-off-by: Thierry Reding --- arch/arm/boot/dts/tegra124-nyan-big.dts | 15 +++++++++------ arch/arm/boot/dts/tegra124-nyan-blaze.dts | 15 +++++++++------ arch/arm/boot/dts/tegra124-venice2.dts | 14 +++++++------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/arm/boot/dts/tegra124-nyan-big.dts b/arch/arm/boot/dts/tegra124-nyan-big.dts index 1d2aac2cb6d0..fdc1d64dfff9 100644 --- a/arch/arm/boot/dts/tegra124-nyan-big.dts +++ b/arch/arm/boot/dts/tegra124-nyan-big.dts @@ -13,12 +13,15 @@ "google,nyan-big-rev1", "google,nyan-big-rev0", "google,nyan-big", "google,nyan", "nvidia,tegra124"; - panel: panel { - compatible = "auo,b133xtn01"; - - power-supply = <&vdd_3v3_panel>; - backlight = <&backlight>; - ddc-i2c-bus = <&dpaux>; + host1x@50000000 { + dpaux@545c0000 { + aux-bus { + panel: panel { + compatible = "auo,b133xtn01"; + backlight = <&backlight>; + }; + }; + }; }; mmc@700b0400 { /* SD Card on this bus */ diff --git a/arch/arm/boot/dts/tegra124-nyan-blaze.dts b/arch/arm/boot/dts/tegra124-nyan-blaze.dts index 677babde6460..abdf4456826f 100644 --- a/arch/arm/boot/dts/tegra124-nyan-blaze.dts +++ b/arch/arm/boot/dts/tegra124-nyan-blaze.dts @@ -15,12 +15,15 @@ "google,nyan-blaze-rev0", "google,nyan-blaze", "google,nyan", "nvidia,tegra124"; - panel: panel { - compatible = "samsung,ltn140at29-301"; - - power-supply = <&vdd_3v3_panel>; - backlight = <&backlight>; - ddc-i2c-bus = <&dpaux>; + host1x@50000000 { + dpaux@545c0000 { + aux-bus { + panel: panel { + compatible = "samsung,ltn140at29-301"; + backlight = <&backlight>; + }; + }; + }; }; sound { diff --git a/arch/arm/boot/dts/tegra124-venice2.dts b/arch/arm/boot/dts/tegra124-venice2.dts index 232c90604df9..6a9592ceb5f2 100644 --- a/arch/arm/boot/dts/tegra124-venice2.dts +++ b/arch/arm/boot/dts/tegra124-venice2.dts @@ -48,6 +48,13 @@ dpaux@545c0000 { vdd-supply = <&vdd_3v3_panel>; status = "okay"; + + aux-bus { + panel: panel { + compatible = "lg,lp129qe"; + backlight = <&backlight>; + }; + }; }; }; @@ -1080,13 +1087,6 @@ }; }; - panel: panel { - compatible = "lg,lp129qe"; - power-supply = <&vdd_3v3_panel>; - backlight = <&backlight>; - ddc-i2c-bus = <&dpaux>; - }; - vdd_mux: regulator-mux { compatible = "regulator-fixed"; regulator-name = "+VDD_MUX"; -- cgit From 515415d316168c6521d74ea8280287e28d7303e6 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 19 Feb 2022 13:07:55 +0100 Subject: ARM: boot: dts: bcm2711: Fix HVS register range While the HVS has the same context memory size in the BCM2711 than in the previous SoCs, the range allocated to the registers doubled and it now takes 16k + 16k, compared to 8k + 16k before. The KMS driver will use the whole context RAM though, eventually resulting in a pointer dereference error when we access the higher half of the context memory since it hasn't been mapped. Fixes: 4564363351e2 ("ARM: dts: bcm2711: Enable the display pipeline") Signed-off-by: Maxime Ripard Signed-off-by: Stefan Wahren Signed-off-by: Florian Fainelli --- arch/arm/boot/dts/bcm2711.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/bcm2711.dtsi b/arch/arm/boot/dts/bcm2711.dtsi index dff18fc9a906..21294f775a20 100644 --- a/arch/arm/boot/dts/bcm2711.dtsi +++ b/arch/arm/boot/dts/bcm2711.dtsi @@ -290,6 +290,7 @@ hvs: hvs@7e400000 { compatible = "brcm,bcm2711-hvs"; + reg = <0x7e400000 0x8000>; interrupts = ; }; -- cgit From 558c303c9734af5a813739cd284879227f7297d2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 10 Nov 2021 14:48:00 +0000 Subject: arm64: Mitigate spectre style branch history side channels Speculation attacks against some high-performance processors can make use of branch history to influence future speculation. When taking an exception from user-space, a sequence of branches or a firmware call overwrites or invalidates the branch history. The sequence of branches is added to the vectors, and should appear before the first indirect branch. For systems using KPTI the sequence is added to the kpti trampoline where it has a free register as the exit from the trampoline is via a 'ret'. For systems not using KPTI, the same register tricks are used to free up a register in the vectors. For the firmware call, arch-workaround-3 clobbers 4 registers, so there is no choice but to save them to the EL1 stack. This only happens for entry from EL0, so if we take an exception due to the stack access, it will not become re-entrant. For KVM, the existing branch-predictor-hardening vectors are used. When a spectre version of these vectors is in use, the firmware call is sufficient to mitigate against Spectre-BHB. For the non-spectre versions, the sequence of branches is added to the indirect vector. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/assembler.h | 14 +- arch/arm64/include/asm/cpufeature.h | 16 +++ arch/arm64/include/asm/cputype.h | 8 ++ arch/arm64/include/asm/spectre.h | 4 +- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/asm/vectors.h | 5 + arch/arm64/kernel/cpu_errata.c | 7 + arch/arm64/kernel/image-vars.h | 3 + arch/arm64/kernel/proton-pack.c | 278 ++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/hyp-entry.S | 8 ++ arch/arm64/tools/cpucaps | 1 + 12 files changed, 352 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cbcd42decb2a..c9631626c0b3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1382,6 +1382,15 @@ config UNMAP_KERNEL_AT_EL0 If unsure, say Y. +config MITIGATE_SPECTRE_BRANCH_HISTORY + bool "Mitigate Spectre style attacks against branch history" if EXPERT + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. + When taking an exception from user-space, a sequence of branches + or a firmware call overwrites the branch history. + config RODATA_FULL_DEFAULT_ENABLED bool "Apply r/o permissions of VM areas also to their linear aliases" default y diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 046c38ee2841..5cc97130843f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -852,7 +852,9 @@ alternative_endif .macro __mitigate_spectre_bhb_loop tmp #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY - mov \tmp, #32 +alternative_cb spectre_bhb_patch_loop_iter + mov \tmp, #32 // Patched to correct the immediate +alternative_cb_end .Lspectre_bhb_loop\@: b . + 4 subs \tmp, \tmp, #1 @@ -861,6 +863,16 @@ alternative_endif #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ .endm + .macro mitigate_spectre_bhb_loop tmp +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +alternative_cb spectre_bhb_patch_loop_mitigation_enable + b .L_spectre_bhb_loop_done\@ // Patched to NOP +alternative_cb_end + __mitigate_spectre_bhb_loop \tmp +.L_spectre_bhb_loop_done\@: +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + /* Save/restores x0-x3 to the stack */ .macro __mitigate_spectre_bhb_fw #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ef6be92b1921..b158fd447f3a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -637,6 +637,22 @@ static inline bool cpu_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } + +static inline bool supports_csv2p3(int scope) +{ + u64 pfr0; + u8 csv2_val; + + if (scope == SCOPE_LOCAL_CPU) + pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); + else + pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + csv2_val = cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV2_SHIFT); + return csv2_val == 3; +} + const struct cpumask *system_32bit_el0_cpumask(void); DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 999b9149f856..bfbf0c4c7c5e 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -73,10 +73,14 @@ #define ARM_CPU_PART_CORTEX_A76 0xD0B #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D +#define ARM_CPU_PART_NEOVERSE_V1 0xD40 +#define ARM_CPU_PART_CORTEX_A78 0xD41 +#define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 +#define ARM_CPU_PART_CORTEX_A78C 0xD4B #define APM_CPU_PART_POTENZA 0x000 @@ -117,10 +121,14 @@ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) +#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) +#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) +#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) +#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index c617fe90a843..86e0cc9b9c68 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -94,6 +94,8 @@ void spectre_v4_enable_task_mitigation(struct task_struct *tsk); enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); - +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +u8 spectre_bhb_loop_affected(int scope); +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); #endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 898bee0004ae..238b9f6a28fc 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -904,6 +904,7 @@ #endif /* id_aa64mmfr1 */ +#define ID_AA64MMFR1_ECBHB_SHIFT 60 #define ID_AA64MMFR1_AFP_SHIFT 44 #define ID_AA64MMFR1_ETS_SHIFT 36 #define ID_AA64MMFR1_TWED_SHIFT 32 diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index 3f76dfd9e074..1f65c37dc653 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -40,6 +40,11 @@ enum arm64_bp_harden_el1_vectors { EL1_VECTOR_KPTI, }; +#ifndef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +#define EL1_VECTOR_BHB_LOOP -1 +#define EL1_VECTOR_BHB_FW -1 +#endif /* !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + /* The vectors to use on return from EL0. e.g. to remap the kernel */ DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index b217941713a8..a401180e8d66 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -502,6 +502,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .matches = has_spectre_v4, .cpu_enable = spectre_v4_enable_mitigation, }, + { + .desc = "Spectre-BHB", + .capability = ARM64_SPECTRE_BHB, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = is_spectre_bhb_affected, + .cpu_enable = spectre_bhb_enable_mitigation, + }, #ifdef CONFIG_ARM64_ERRATUM_1418040 { .desc = "ARM erratum 1418040", diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 7eaf1f7c4168..56db964a4f0c 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -66,6 +66,9 @@ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); KVM_NVHE_ALIAS(kvm_get_kimage_voffset); KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); +KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 8a499b8373c0..fbbcd1d2eb19 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -24,9 +24,11 @@ #include #include +#include #include #include #include +#include #include /* @@ -796,6 +798,17 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) } } +/* + * Spectre BHB. + * + * A CPU is either: + * - Mitigated by a branchy loop a CPU specific number of times, and listed + * in our "loop mitigated list". + * - Mitigated in software by the firmware Spectre v2 call. + * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no + * software mitigation in the vectors is needed. + * - Has CSV2.3, so is unaffected. + */ static enum mitigation_state spectre_bhb_state; enum mitigation_state arm64_get_spectre_bhb_state(void) @@ -803,12 +816,227 @@ enum mitigation_state arm64_get_spectre_bhb_state(void) return spectre_bhb_state; } +enum bhb_mitigation_bits { + BHB_LOOP, + BHB_FW, + BHB_HW, +}; +static unsigned long system_bhb_mitigations; + +/* + * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any + * SCOPE_SYSTEM call will give the right answer. + */ +u8 spectre_bhb_loop_affected(int scope) +{ + u8 k = 0; + static u8 max_bhb_k; + + if (scope == SCOPE_LOCAL_CPU) { + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + k = 8; + + max_bhb_k = max(max_bhb_k, k); + } else { + k = max_bhb_k; + } + + return k; +} + +static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +static bool is_spectre_bhb_fw_affected(int scope) +{ + static bool system_affected; + enum mitigation_state fw_state; + bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; + static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + {}, + }; + bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), + spectre_bhb_firmware_mitigated_list); + + if (scope != SCOPE_LOCAL_CPU) + return system_affected; + + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { + system_affected = true; + return true; + } + + return false; +} + +static bool supports_ecbhb(int scope) +{ + u64 mmfr1; + + if (scope == SCOPE_LOCAL_CPU) + mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1); + else + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_ECBHB_SHIFT); +} + +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (supports_csv2p3(scope)) + return false; + + if (spectre_bhb_loop_affected(scope)) + return true; + + if (is_spectre_bhb_fw_affected(scope)) + return true; + + return false; +} + +static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) +{ + const char *v = arm64_get_bp_hardening_vector(slot); + + if (slot < 0) + return; + + __this_cpu_write(this_cpu_vector, v); + + /* + * When KPTI is in use, the vectors are switched when exiting to + * user-space. + */ + if (arm64_kernel_unmapped_at_el0()) + return; + + write_sysreg(v, vbar_el1); + isb(); +} + +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) +{ + bp_hardening_cb_t cpu_cb; + enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) + return; + + if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { + /* No point mitigating Spectre-BHB alone. */ + } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { + pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); + } else if (cpu_mitigations_off()) { + pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + set_bit(BHB_HW, &system_bhb_mitigations); + } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have the + * branchy-loop added. A57/A72-r0 will already have selected + * the spectre-indirect vector, which is sufficient for BHB + * too. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); + state = SPECTRE_MITIGATED; + set_bit(BHB_LOOP, &system_bhb_mitigations); + } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (fw_state == SPECTRE_MITIGATED) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. + */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); + } + } + + update_mitigation_state(&spectre_bhb_state, state); +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst) { BUG_ON(nr_inst != 1); + + if (test_bit(BHB_LOOP, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); } /* Patched to NOP when enabled */ @@ -817,4 +1045,54 @@ void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, __le32 *updptr, int nr_inst) { BUG_ON(nr_inst != 1); + + if (test_bit(BHB_FW, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to correct the immediate */ +void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to mov WA3 when supported */ +void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) || + !test_bit(BHB_FW, &system_bhb_mitigations)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + + insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_REG_ZR, rd, + ARM_SMCCC_ARCH_WORKAROUND_3); + if (WARN_ON_ONCE(insn == AARCH64_BREAK_FAULT)) + return; + + *updptr++ = cpu_to_le32(insn); } diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index b6b6801d96d5..12aa49324955 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -62,6 +62,10 @@ el1_sync: // Guest trapped into EL2 /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_1 ^ \ ARM_SMCCC_ARCH_WORKAROUND_2) + cbz w1, wa_epilogue + + eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_2 ^ \ + ARM_SMCCC_ARCH_WORKAROUND_3) cbnz w1, el1_trap wa_epilogue: @@ -192,7 +196,10 @@ SYM_CODE_END(__kvm_hyp_vector) sub sp, sp, #(8 * 4) stp x2, x3, [sp, #(8 * 0)] stp x0, x1, [sp, #(8 * 2)] + alternative_cb spectre_bhb_patch_wa3 + /* Patched to mov WA3 when supported */ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 + alternative_cb_end smc #0 ldp x2, x3, [sp, #(8 * 0)] add sp, sp, #(8 * 2) @@ -205,6 +212,7 @@ SYM_CODE_END(__kvm_hyp_vector) spectrev2_smccc_wa1_smc .else stp x0, x1, [sp, #-16]! + mitigate_spectre_bhb_loop x0 .endif .if \indirect != 0 alternative_cb kvm_patch_vector_branch diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 9c65b1e25a96..cea7533cb304 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -44,6 +44,7 @@ MTE_ASYMM SPECTRE_V2 SPECTRE_V3A SPECTRE_V4 +SPECTRE_BHB SSBS SVE UNMAP_KERNEL_AT_EL0 -- cgit From a5905d6af492ee6a4a2205f0d550b3f931b03d03 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 10 Dec 2021 11:16:18 +0000 Subject: KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated KVM allows the guest to discover whether the ARCH_WORKAROUND SMCCC are implemented, and to preserve that state during migration through its firmware register interface. Add the necessary boiler plate for SMCCC_ARCH_WORKAROUND_3. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/uapi/asm/kvm.h | 5 +++++ arch/arm64/kvm/hypercalls.c | 12 ++++++++++++ arch/arm64/kvm/psci.c | 18 +++++++++++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index b3edde68bc3e..323e251ed37b 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -281,6 +281,11 @@ struct kvm_arm_copy_mte_tags { #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 KVM_REG_ARM_FW_REG(3) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED 2 + /* SVE registers */ #define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 30da78f72b3b..202b8c455724 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -107,6 +107,18 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; } break; + case ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + break; + case SPECTRE_MITIGATED: + val[0] = SMCCC_RET_SUCCESS; + break; + case SPECTRE_UNAFFECTED: + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + break; + } + break; case ARM_SMCCC_HV_PV_TIME_FEATURES: val[0] = SMCCC_RET_SUCCESS; break; diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 3eae32876897..14b9726041ff 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -406,7 +406,7 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) { - return 3; /* PSCI version and two workaround registers */ + return 4; /* PSCI version and three workaround registers */ } int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) @@ -420,6 +420,9 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++)) return -EFAULT; + if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, uindices++)) + return -EFAULT; + return 0; } @@ -459,6 +462,17 @@ static int get_kernel_wa_level(u64 regid) case SPECTRE_VULNERABLE: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; } + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; } return -EINVAL; @@ -475,6 +489,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) break; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; break; default: @@ -520,6 +535,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: if (val & ~KVM_REG_FEATURE_LEVEL_MASK) return -EINVAL; -- cgit From 228a26b912287934789023b4132ba76065d9491c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 10 Dec 2021 14:32:56 +0000 Subject: arm64: Use the clearbhb instruction in mitigations Future CPUs may implement a clearbhb instruction that is sufficient to mitigate SpectreBHB. CPUs that implement this instruction, but not CSV2.3 must be affected by Spectre-BHB. Add support to use this instruction as the BHB mitigation on CPUs that support it. The instruction is in the hint space, so it will be treated by a NOP as older CPUs. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/assembler.h | 17 +++++++++++++++++ arch/arm64/include/asm/cpufeature.h | 13 +++++++++++++ arch/arm64/include/asm/insn.h | 1 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/asm/vectors.h | 7 +++++++ arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 8 ++++++++ arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kernel/proton-pack.c | 29 +++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/hyp-entry.S | 1 + 10 files changed, 79 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 5cc97130843f..6ebdc0f834a7 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -108,6 +108,13 @@ hint #20 .endm +/* + * Clear Branch History instruction + */ + .macro clearbhb + hint #22 + .endm + /* * Speculation barrier */ @@ -884,6 +891,16 @@ alternative_cb smccc_patch_fw_mitigation_conduit alternative_cb_end ldp x2, x3, [sp], #16 ldp x0, x1, [sp], #16 +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + + .macro mitigate_spectre_bhb_clear_insn +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +alternative_cb spectre_bhb_patch_clearbhb + /* Patched to NOP when not supported */ + clearbhb + isb +alternative_cb_end #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ .endm #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index b158fd447f3a..a77b5f49b3a6 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -653,6 +653,19 @@ static inline bool supports_csv2p3(int scope) return csv2_val == 3; } +static inline bool supports_clearbhb(int scope) +{ + u64 isar2; + + if (scope == SCOPE_LOCAL_CPU) + isar2 = read_sysreg_s(SYS_ID_AA64ISAR2_EL1); + else + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + return cpuid_feature_extract_unsigned_field(isar2, + ID_AA64ISAR2_CLEARBHB_SHIFT); +} + const struct cpumask *system_32bit_el0_cpumask(void); DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 6b776c8667b2..b02f0c328c8e 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -65,6 +65,7 @@ enum aarch64_insn_hint_cr_op { AARCH64_INSN_HINT_PSB = 0x11 << 5, AARCH64_INSN_HINT_TSB = 0x12 << 5, AARCH64_INSN_HINT_CSDB = 0x14 << 5, + AARCH64_INSN_HINT_CLEARBHB = 0x16 << 5, AARCH64_INSN_HINT_BTI = 0x20 << 5, AARCH64_INSN_HINT_BTIC = 0x22 << 5, diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 238b9f6a28fc..932d45b17877 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -773,6 +773,7 @@ #define ID_AA64ISAR1_GPI_IMP_DEF 0x1 /* id_aa64isar2 */ +#define ID_AA64ISAR2_CLEARBHB_SHIFT 28 #define ID_AA64ISAR2_RPRES_SHIFT 4 #define ID_AA64ISAR2_WFXT_SHIFT 0 diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index 1f65c37dc653..f64613a96d53 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -32,6 +32,12 @@ enum arm64_bp_harden_el1_vectors { * canonical vectors. */ EL1_VECTOR_BHB_FW, + + /* + * Use the ClearBHB instruction, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_CLEAR_INSN, #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ /* @@ -43,6 +49,7 @@ enum arm64_bp_harden_el1_vectors { #ifndef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY #define EL1_VECTOR_BHB_LOOP -1 #define EL1_VECTOR_BHB_FW -1 +#define EL1_VECTOR_BHB_CLEAR_INSN -1 #endif /* !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ /* The vectors to use on return from EL0. e.g. to remap the kernel */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 45fed4974c44..d33687673f6b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -231,6 +231,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_CLEARBHB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0), ARM64_FTR_END, }; diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a62fee121138..4a3a653df07e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -657,6 +657,7 @@ alternative_else_nop_endif #define BHB_MITIGATION_NONE 0 #define BHB_MITIGATION_LOOP 1 #define BHB_MITIGATION_FW 2 +#define BHB_MITIGATION_INSN 3 .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 @@ -673,6 +674,11 @@ alternative_else_nop_endif __mitigate_spectre_bhb_loop x30 .endif // \bhb == BHB_MITIGATION_LOOP + .if \bhb == BHB_MITIGATION_INSN + clearbhb + isb + .endif // \bhb == BHB_MITIGATION_INSN + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy @@ -749,6 +755,7 @@ SYM_CODE_START_NOALIGN(tramp_vectors) #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_INSN #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) @@ -811,6 +818,7 @@ SYM_CODE_START(__bp_harden_el1_vectors) #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY generate_el1_vector bhb=BHB_MITIGATION_LOOP generate_el1_vector bhb=BHB_MITIGATION_FW + generate_el1_vector bhb=BHB_MITIGATION_INSN #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ SYM_CODE_END(__bp_harden_el1_vectors) .popsection diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 56db964a4f0c..55a1ced8eb77 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -69,6 +69,7 @@ KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); +KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index fbbcd1d2eb19..d3fbff00993d 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -805,6 +805,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) * - Mitigated by a branchy loop a CPU specific number of times, and listed * in our "loop mitigated list". * - Mitigated in software by the firmware Spectre v2 call. + * - Has the ClearBHB instruction to perform the mitigation. * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no * software mitigation in the vectors is needed. * - Has CSV2.3, so is unaffected. @@ -820,6 +821,7 @@ enum bhb_mitigation_bits { BHB_LOOP, BHB_FW, BHB_HW, + BHB_INSN, }; static unsigned long system_bhb_mitigations; @@ -937,6 +939,9 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; + if (supports_clearbhb(scope)) + return true; + if (spectre_bhb_loop_affected(scope)) return true; @@ -984,6 +989,17 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { state = SPECTRE_MITIGATED; set_bit(BHB_HW, &system_bhb_mitigations); + } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have ClearBHB + * added. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); + state = SPECTRE_MITIGATED; + set_bit(BHB_INSN, &system_bhb_mitigations); } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { /* * Ensure KVM uses the indirect vector which will have the @@ -1096,3 +1112,16 @@ void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt, *updptr++ = cpu_to_le32(insn); } + +/* Patched to NOP when not supported */ +void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 2); + + if (test_bit(BHB_INSN, &system_bhb_mitigations)) + return; + + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 12aa49324955..7839d075729b 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -213,6 +213,7 @@ SYM_CODE_END(__kvm_hyp_vector) .else stp x0, x1, [sp, #-16]! mitigate_spectre_bhb_loop x0 + mitigate_spectre_bhb_clear_insn .endif .if \indirect != 0 alternative_cb kvm_patch_vector_branch -- cgit From dd3b1dc3dd050f1f47cd13e300732852414270f8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 4 Feb 2022 13:12:35 -0800 Subject: Bluetooth: hci_core: Fix leaking sent_cmd skb sent_cmd memory is not freed before freeing hci_dev causing it to leak it contents. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2b7bd3655b07..2882bc7d79d7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2738,6 +2738,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); + kfree_skb(hdev->sent_cmd); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); -- cgit From fa78d2d1d64f147062e384a4a10a26a5f89944b5 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 16 Feb 2022 12:37:14 +0800 Subject: Bluetooth: fix data races in smp_unregister(), smp_del_chan() Previous commit e04480920d1e ("Bluetooth: defer cleanup of resources in hci_unregister_dev()") defers all destructive actions to hci_release_dev() to prevent cocurrent problems like NPD, UAF. However, there are still some exceptions that are ignored. The smp_unregister() in hci_dev_close_sync() (previously in hci_dev_do_close) will release resources like the sensitive channel and the smp_dev objects. Consider the situations the device is detaching or power down while the kernel is still operating on it, the following data race could take place. thread-A hci_dev_close_sync | thread-B read_local_oob_ext_data | hci_dev_unlock() | ... | hci_dev_lock() if (hdev->smp_data) | chan = hdev->smp_data | | chan = hdev->smp_data (3) | hdev->smp_data = NULL (1) | if (!chan || !chan->data) (4) ... | smp = chan->data | smp = chan->data if (smp) | chan->data = NULL (2) | ... | kfree_sensitive(smp) | | // dereference smp trigger UFA That is, the objects hdev->smp_data and chan->data both suffer from the data races. In a preempt-enable kernel, the above schedule (when (3) is before (1) and (4) is before (2)) leads to UAF bugs. It can be reproduced in the latest kernel and below is part of the report: [ 49.097146] ================================================================ [ 49.097611] BUG: KASAN: use-after-free in smp_generate_oob+0x2dd/0x570 [ 49.097611] Read of size 8 at addr ffff888006528360 by task generate_oob/155 [ 49.097611] [ 49.097611] Call Trace: [ 49.097611] [ 49.097611] dump_stack_lvl+0x34/0x44 [ 49.097611] print_address_description.constprop.0+0x1f/0x150 [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] kasan_report.cold+0x7f/0x11b [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] smp_generate_oob+0x2dd/0x570 [ 49.097611] read_local_oob_ext_data+0x689/0xc30 [ 49.097611] ? hci_event_packet+0xc80/0xc80 [ 49.097611] ? sysvec_apic_timer_interrupt+0x9b/0xc0 [ 49.097611] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 49.097611] ? mgmt_init_hdev+0x1c/0x240 [ 49.097611] ? mgmt_init_hdev+0x28/0x240 [ 49.097611] hci_sock_sendmsg+0x1880/0x1e70 [ 49.097611] ? create_monitor_event+0x890/0x890 [ 49.097611] ? create_monitor_event+0x890/0x890 [ 49.097611] sock_sendmsg+0xdf/0x110 [ 49.097611] __sys_sendto+0x19e/0x270 [ 49.097611] ? __ia32_sys_getpeername+0xa0/0xa0 [ 49.097611] ? kernel_fpu_begin_mask+0x1c0/0x1c0 [ 49.097611] __x64_sys_sendto+0xd8/0x1b0 [ 49.097611] ? syscall_exit_to_user_mode+0x1d/0x40 [ 49.097611] do_syscall_64+0x3b/0x90 [ 49.097611] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 49.097611] RIP: 0033:0x7f5a59f51f64 ... [ 49.097611] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5a59f51f64 [ 49.097611] RDX: 0000000000000007 RSI: 00007f5a59d6ac70 RDI: 0000000000000006 [ 49.097611] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 49.097611] R10: 0000000000000040 R11: 0000000000000246 R12: 00007ffec26916ee [ 49.097611] R13: 00007ffec26916ef R14: 00007f5a59d6afc0 R15: 00007f5a59d6b700 To solve these data races, this patch places the smp_unregister() function in the protected area by the hci_dev_lock(). That is, the smp_unregister() function can not be concurrently executed when operating functions (most of them are mgmt operations in mgmt.c) hold the device lock. This patch is tested with kernel LOCK DEBUGGING enabled. The price from the extended holding time of the device lock is supposed to be low as the smp_unregister() function is fairly short and efficient. Signed-off-by: Lin Ma Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 0feb68f12545..e34fc15b7d2c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4106,9 +4106,9 @@ int hci_dev_close_sync(struct hci_dev *hdev) hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); - hci_dev_unlock(hdev); - + /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); + hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); -- cgit From 29fb608396d6a62c1b85acc421ad7a4399085b9f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 14 Feb 2022 17:59:38 -0800 Subject: Bluetooth: Fix bt_skb_sendmmsg not allocating partial chunks Since bt_skb_sendmmsg can be used with the likes of SOCK_STREAM it shall return the partial chunks it could allocate instead of freeing everything as otherwise it can cause problems like bellow. Fixes: 81be03e026dc ("Bluetooth: RFCOMM: Replace use of memcpy_from_msg with bt_skb_sendmmsg") Reported-by: Paul Menzel Link: https://lore.kernel.org/r/d7206e12-1b99-c3be-84f4-df22af427ef5@molgen.mpg.de BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215594 Signed-off-by: Luiz Augusto von Dentz Tested-by: Paul Menzel (Nokia N9 (MeeGo/Harmattan) Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 4b3d0b16c185..a647e5fabdbd 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -506,8 +506,7 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, tmp = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom); if (IS_ERR(tmp)) { - kfree_skb(skb); - return tmp; + return skb; } len -= tmp->len; -- cgit From 2e8ecb4bbc13d4752d64a9f8f5512d59125cab25 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 14 Feb 2022 18:01:56 -0800 Subject: Bluetooth: assign len after null check len should be assigned after a null check Signed-off-by: Wang Qing Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt_util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index edee60bbc7b4..37eef2ce55ae 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -77,11 +77,12 @@ int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, { struct hci_dev *hdev; struct mgmt_hdr *hdr; - int len = skb->len; + int len; if (!skb) return -EINVAL; + len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ -- cgit From 80740ebb7e1ad15ab9c11425dcd26e073f86d74b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Feb 2022 07:11:47 -0800 Subject: Bluetooth: hci_sync: Fix hci_update_accept_list_sync hci_update_accept_list_sync is returning the filter based on the error but that gets overwritten by hci_le_set_addr_resolution_enable_sync return instead of using the actual result of the likes of hci_le_add_accept_list_sync which was intended. Fixes: ad383c2c65a5b ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e34fc15b7d2c..9d8490570b42 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1841,6 +1841,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; + u8 filter_policy; int err; /* Pause advertising if resolving list can be used as controllers are @@ -1927,6 +1928,8 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) err = -EINVAL; done: + filter_policy = err ? 0x00 : 0x01; + /* Enable address resolution when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) @@ -1937,7 +1940,7 @@ done: hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ - return err ? 0x00 : 0x01; + return filter_policy; } /* Returns true if an le connection is in the scanning state */ -- cgit From a56a1138cbd85e4d565356199d60e1cb94e5a77a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 17 Feb 2022 13:10:38 -0800 Subject: Bluetooth: hci_sync: Fix not using conn_timeout When using hci_le_create_conn_sync it shall wait for the conn_timeout since the connection complete may take longer than just 2 seconds. Also fix the masking of HCI_EV_LE_ENHANCED_CONN_COMPLETE and HCI_EV_LE_CONN_COMPLETE so they are never both set so we can predict which one the controller will use in case of HCI_OP_LE_CREATE_CONN. Fixes: 6cd29ec6ae5e3 ("Bluetooth: hci_sync: Wait for proper events when connecting LE") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 8 ++++++++ net/bluetooth/hci_sync.c | 21 +++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 586f69d084a2..e336e9c1dda4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1489,6 +1489,14 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: + * + * C24: Mandatory if the LE Controller supports Connection State and either + * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported + */ +#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 9d8490570b42..9ba2a1a7d481 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3265,10 +3265,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ - /* If the controller supports LL Privacy feature, enable - * the corresponding event. + /* If the controller supports LL Privacy feature or LE Extended Adv, + * enable the corresponding event. */ - if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* If the controller supports Extended Scanner Filter @@ -5188,7 +5188,7 @@ int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + conn->conn_timeout, NULL); } int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -5273,9 +5273,18 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: + * + * If this event is unmasked and the HCI_LE_Connection_Complete event + * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is + * sent when a new connection has been created. + */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, - sizeof(cp), &cp, HCI_EV_LE_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + sizeof(cp), &cp, + use_enhanced_conn_complete(hdev) ? + HCI_EV_LE_ENHANCED_CONN_COMPLETE : + HCI_EV_LE_CONN_COMPLETE, + conn->conn_timeout, NULL); done: /* Re-enable advertising after the connection attempt is finished. */ -- cgit From 4e7c4d3652f96f41179aab3ff53025c7a550d689 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Thu, 24 Feb 2022 00:26:05 +0530 Subject: clk: qcom: gdsc: Add support to update GDSC transition delay GDSCs have multiple transition delays which are used for the GDSC FSM states. Older targets/designs required these values to be updated from gdsc code to certain default values for the FSM state to work as expected. But on the newer targets/designs the values updated from the GDSC driver can hamper the FSM state to not work as expected. On SC7180 we observe black screens because the gdsc is being enabled/disabled very rapidly and the GDSC FSM state does not work as expected. This is due to the fact that the GDSC reset value is being updated from SW. Thus add support to update the transition delay from the clock controller gdscs as required. Fixes: 45dd0e55317cc ("clk: qcom: Add support for GDSCs) Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20220223185606.3941-1-tdas@codeaurora.org Reviewed-by: Bjorn Andersson Signed-off-by: Stephen Boyd --- drivers/clk/qcom/gdsc.c | 26 +++++++++++++++++++++----- drivers/clk/qcom/gdsc.h | 8 +++++++- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/clk/qcom/gdsc.c b/drivers/clk/qcom/gdsc.c index 7e1dd8ccfa38..44520efc6c72 100644 --- a/drivers/clk/qcom/gdsc.c +++ b/drivers/clk/qcom/gdsc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2015, 2017-2018, The Linux Foundation. All rights reserved. + * Copyright (c) 2015, 2017-2018, 2022, The Linux Foundation. All rights reserved. */ #include @@ -35,9 +35,14 @@ #define CFG_GDSCR_OFFSET 0x4 /* Wait 2^n CXO cycles between all states. Here, n=2 (4 cycles). */ -#define EN_REST_WAIT_VAL (0x2 << 20) -#define EN_FEW_WAIT_VAL (0x8 << 16) -#define CLK_DIS_WAIT_VAL (0x2 << 12) +#define EN_REST_WAIT_VAL 0x2 +#define EN_FEW_WAIT_VAL 0x8 +#define CLK_DIS_WAIT_VAL 0x2 + +/* Transition delay shifts */ +#define EN_REST_WAIT_SHIFT 20 +#define EN_FEW_WAIT_SHIFT 16 +#define CLK_DIS_WAIT_SHIFT 12 #define RETAIN_MEM BIT(14) #define RETAIN_PERIPH BIT(13) @@ -380,7 +385,18 @@ static int gdsc_init(struct gdsc *sc) */ mask = HW_CONTROL_MASK | SW_OVERRIDE_MASK | EN_REST_WAIT_MASK | EN_FEW_WAIT_MASK | CLK_DIS_WAIT_MASK; - val = EN_REST_WAIT_VAL | EN_FEW_WAIT_VAL | CLK_DIS_WAIT_VAL; + + if (!sc->en_rest_wait_val) + sc->en_rest_wait_val = EN_REST_WAIT_VAL; + if (!sc->en_few_wait_val) + sc->en_few_wait_val = EN_FEW_WAIT_VAL; + if (!sc->clk_dis_wait_val) + sc->clk_dis_wait_val = CLK_DIS_WAIT_VAL; + + val = sc->en_rest_wait_val << EN_REST_WAIT_SHIFT | + sc->en_few_wait_val << EN_FEW_WAIT_SHIFT | + sc->clk_dis_wait_val << CLK_DIS_WAIT_SHIFT; + ret = regmap_update_bits(sc->regmap, sc->gdscr, mask, val); if (ret) return ret; diff --git a/drivers/clk/qcom/gdsc.h b/drivers/clk/qcom/gdsc.h index d7cc4c21a9d4..ad313d7210bd 100644 --- a/drivers/clk/qcom/gdsc.h +++ b/drivers/clk/qcom/gdsc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2015, 2017-2018, The Linux Foundation. All rights reserved. + * Copyright (c) 2015, 2017-2018, 2022, The Linux Foundation. All rights reserved. */ #ifndef __QCOM_GDSC_H__ @@ -22,6 +22,9 @@ struct reset_controller_dev; * @cxcs: offsets of branch registers to toggle mem/periph bits in * @cxc_count: number of @cxcs * @pwrsts: Possible powerdomain power states + * @en_rest_wait_val: transition delay value for receiving enr ack signal + * @en_few_wait_val: transition delay value for receiving enf ack signal + * @clk_dis_wait_val: transition delay value for halting clock * @resets: ids of resets associated with this gdsc * @reset_count: number of @resets * @rcdev: reset controller @@ -36,6 +39,9 @@ struct gdsc { unsigned int clamp_io_ctrl; unsigned int *cxcs; unsigned int cxc_count; + unsigned int en_rest_wait_val; + unsigned int en_few_wait_val; + unsigned int clk_dis_wait_val; const u8 pwrsts; /* Powerdomain allowable state bitfields */ #define PWRSTS_OFF BIT(0) -- cgit From 6e6fec3f961c00ca34ffb4bf2ad9febb4b499f8d Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Thu, 24 Feb 2022 00:26:06 +0530 Subject: clk: qcom: dispcc: Update the transition delay for MDSS GDSC On SC7180 we observe black screens because the gdsc is being enabled/disabled very rapidly and the GDSC FSM state does not work as expected. This is due to the fact that the GDSC reset value is being updated from SW. The recommended transition delay for mdss core gdsc updated for SC7180/SC7280/SM8250. Fixes: dd3d06622138 ("clk: qcom: Add display clock controller driver for SC7180") Fixes: 1a00c962f9cd ("clk: qcom: Add display clock controller driver for SC7280") Fixes: 80a18f4a8567 ("clk: qcom: Add display clock controller driver for SM8150 and SM8250") Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20220223185606.3941-2-tdas@codeaurora.org Reviewed-by: Bjorn Andersson [sboyd@kernel.org: lowercase hex] Signed-off-by: Stephen Boyd --- drivers/clk/qcom/dispcc-sc7180.c | 5 ++++- drivers/clk/qcom/dispcc-sc7280.c | 5 ++++- drivers/clk/qcom/dispcc-sm8250.c | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/clk/qcom/dispcc-sc7180.c b/drivers/clk/qcom/dispcc-sc7180.c index 538e4963c915..5d2ae297e741 100644 --- a/drivers/clk/qcom/dispcc-sc7180.c +++ b/drivers/clk/qcom/dispcc-sc7180.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2019, The Linux Foundation. All rights reserved. + * Copyright (c) 2019, 2022, The Linux Foundation. All rights reserved. */ #include @@ -625,6 +625,9 @@ static struct clk_branch disp_cc_mdss_vsync_clk = { static struct gdsc mdss_gdsc = { .gdscr = 0x3000, + .en_rest_wait_val = 0x2, + .en_few_wait_val = 0x2, + .clk_dis_wait_val = 0xf, .pd = { .name = "mdss_gdsc", }, diff --git a/drivers/clk/qcom/dispcc-sc7280.c b/drivers/clk/qcom/dispcc-sc7280.c index 4ef4ae231794..ad596d567f6a 100644 --- a/drivers/clk/qcom/dispcc-sc7280.c +++ b/drivers/clk/qcom/dispcc-sc7280.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2021, The Linux Foundation. All rights reserved. + * Copyright (c) 2021-2022, The Linux Foundation. All rights reserved. */ #include @@ -787,6 +787,9 @@ static struct clk_branch disp_cc_sleep_clk = { static struct gdsc disp_cc_mdss_core_gdsc = { .gdscr = 0x1004, + .en_rest_wait_val = 0x2, + .en_few_wait_val = 0x2, + .clk_dis_wait_val = 0xf, .pd = { .name = "disp_cc_mdss_core_gdsc", }, diff --git a/drivers/clk/qcom/dispcc-sm8250.c b/drivers/clk/qcom/dispcc-sm8250.c index 566fdfa0a15b..db9379634fb2 100644 --- a/drivers/clk/qcom/dispcc-sm8250.c +++ b/drivers/clk/qcom/dispcc-sm8250.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2018-2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2018-2020, 2022, The Linux Foundation. All rights reserved. */ #include @@ -1126,6 +1126,9 @@ static struct clk_branch disp_cc_mdss_vsync_clk = { static struct gdsc mdss_gdsc = { .gdscr = 0x3000, + .en_rest_wait_val = 0x2, + .en_few_wait_val = 0x2, + .clk_dis_wait_val = 0xf, .pd = { .name = "mdss_gdsc", }, -- cgit From 0d22b031662ad48d5835e470a90784f4b39adce9 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:30 +0000 Subject: drm/exynos/exynos7_drm_decon: Use platform_get_irq_byname() to get the interrupt platform_get_resource_byname(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq_byname(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos7_drm_decon.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos7_drm_decon.c b/drivers/gpu/drm/exynos/exynos7_drm_decon.c index 12571ac45540..c04264f70ad1 100644 --- a/drivers/gpu/drm/exynos/exynos7_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos7_drm_decon.c @@ -678,7 +678,6 @@ static int decon_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct decon_context *ctx; struct device_node *i80_if_timings; - struct resource *res; int ret; if (!dev->of_node) @@ -728,16 +727,11 @@ static int decon_probe(struct platform_device *pdev) goto err_iounmap; } - res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, - ctx->i80_if ? "lcd_sys" : "vsync"); - if (!res) { - dev_err(dev, "irq request failed.\n"); - ret = -ENXIO; + ret = platform_get_irq_byname(pdev, ctx->i80_if ? "lcd_sys" : "vsync"); + if (ret < 0) goto err_iounmap; - } - ret = devm_request_irq(dev, res->start, decon_irq_handler, - 0, "drm_decon", ctx); + ret = devm_request_irq(dev, ret, decon_irq_handler, 0, "drm_decon", ctx); if (ret) { dev_err(dev, "irq request failed.\n"); goto err_iounmap; -- cgit From be52abd4d2b7ea343373cc116a99699a3e3c5573 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:31 +0000 Subject: drm/exynos: mixer: Use platform_get_irq() to get the interrupt platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_mixer.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c index 41c54f1f60bc..e5204be86093 100644 --- a/drivers/gpu/drm/exynos/exynos_mixer.c +++ b/drivers/gpu/drm/exynos/exynos_mixer.c @@ -809,19 +809,17 @@ static int mixer_resources_init(struct mixer_context *mixer_ctx) return -ENXIO; } - res = platform_get_resource(mixer_ctx->pdev, IORESOURCE_IRQ, 0); - if (res == NULL) { - dev_err(dev, "get interrupt resource failed.\n"); - return -ENXIO; - } + ret = platform_get_irq(mixer_ctx->pdev, 0); + if (ret < 0) + return ret; + mixer_ctx->irq = ret; - ret = devm_request_irq(dev, res->start, mixer_irq_handler, - 0, "drm_mixer", mixer_ctx); + ret = devm_request_irq(dev, mixer_ctx->irq, mixer_irq_handler, + 0, "drm_mixer", mixer_ctx); if (ret) { dev_err(dev, "request interrupt failed.\n"); return ret; } - mixer_ctx->irq = res->start; return 0; } -- cgit From b342c1f335981ebc442127efe03524d2331a273c Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:32 +0000 Subject: drm/exynos/exynos_drm_fimd: Use platform_get_irq_byname() to get the interrupt platform_get_resource_byname(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq_byname(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimd.c b/drivers/gpu/drm/exynos/exynos_drm_fimd.c index c735e53939d8..7d5a483a54de 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimd.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimd.c @@ -1133,7 +1133,6 @@ static int fimd_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct fimd_context *ctx; struct device_node *i80_if_timings; - struct resource *res; int ret; if (!dev->of_node) @@ -1206,15 +1205,11 @@ static int fimd_probe(struct platform_device *pdev) if (IS_ERR(ctx->regs)) return PTR_ERR(ctx->regs); - res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, - ctx->i80_if ? "lcd_sys" : "vsync"); - if (!res) { - dev_err(dev, "irq request failed.\n"); - return -ENXIO; - } + ret = platform_get_irq_byname(pdev, ctx->i80_if ? "lcd_sys" : "vsync"); + if (ret < 0) + return ret; - ret = devm_request_irq(dev, res->start, fimd_irq_handler, - 0, "drm_fimd", ctx); + ret = devm_request_irq(dev, ret, fimd_irq_handler, 0, "drm_fimd", ctx); if (ret) { dev_err(dev, "irq request failed.\n"); return ret; -- cgit From be0a3b7e2a97e3f73004a5b453cc2023d8c1317a Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:33 +0000 Subject: drm/exynos/fimc: Use platform_get_irq() to get the interrupt platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimc.c b/drivers/gpu/drm/exynos/exynos_drm_fimc.c index 023f54ee61a8..0ee32e4b1e43 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimc.c @@ -1267,7 +1267,6 @@ static int fimc_probe(struct platform_device *pdev) struct exynos_drm_ipp_formats *formats; struct device *dev = &pdev->dev; struct fimc_context *ctx; - struct resource *res; int ret; int i, j, num_limits, num_formats; @@ -1330,14 +1329,12 @@ static int fimc_probe(struct platform_device *pdev) return PTR_ERR(ctx->regs); /* resource irq */ - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!res) { - dev_err(dev, "failed to request irq resource.\n"); - return -ENOENT; - } + ret = platform_get_irq(pdev, 0); + if (ret < 0) + return ret; - ret = devm_request_irq(dev, res->start, fimc_irq_handler, - 0, dev_name(dev), ctx); + ret = devm_request_irq(dev, ret, fimc_irq_handler, + 0, dev_name(dev), ctx); if (ret < 0) { dev_err(dev, "failed to request irq.\n"); return ret; -- cgit From 586d0902456ad965c9a456fd0a0f451518aed1c5 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:34 +0000 Subject: drm/exynos: gsc: Use platform_get_irq() to get the interrupt platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_gsc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_gsc.c b/drivers/gpu/drm/exynos/exynos_drm_gsc.c index 166a80262896..964dceb28c1e 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gsc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gsc.c @@ -1220,7 +1220,6 @@ static int gsc_probe(struct platform_device *pdev) struct gsc_driverdata *driver_data; struct exynos_drm_ipp_formats *formats; struct gsc_context *ctx; - struct resource *res; int num_formats, ret, i, j; ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); @@ -1275,13 +1274,10 @@ static int gsc_probe(struct platform_device *pdev) return PTR_ERR(ctx->regs); /* resource irq */ - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!res) { - dev_err(dev, "failed to request irq resource.\n"); - return -ENOENT; - } + ctx->irq = platform_get_irq(pdev, 0); + if (ctx->irq < 0) + return ctx->irq; - ctx->irq = res->start; ret = devm_request_irq(dev, ctx->irq, gsc_irq_handler, 0, dev_name(dev), ctx); if (ret < 0) { -- cgit From 0a6e8d0a6df67e0fff9c7d130b89769df4167c2b Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 21 Jan 2022 11:00:39 +0100 Subject: drm/exynos: Don't fail if no TE-gpio is defined for DSI driver TE-gpio is optional and if it is not found then gpiod_get_optional() returns NULL. In such case the code will continue and try to convert NULL gpiod to irq what in turn fails. The failure is then propagated and driver is not registered. Fix this by returning early from exynos_dsi_register_te_irq() if no TE-gpio is found. Fixes: ee6c8b5afa62 ("drm/exynos: Replace legacy gpio interface for gpiod interface") Signed-off-by: Marek Szyprowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_dsi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c b/drivers/gpu/drm/exynos/exynos_drm_dsi.c index 32a36572b894..14ebbb124852 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c @@ -1335,7 +1335,9 @@ static int exynos_dsi_register_te_irq(struct exynos_dsi *dsi, int te_gpio_irq; dsi->te_gpio = devm_gpiod_get_optional(dsi->dev, "te", GPIOD_IN); - if (IS_ERR(dsi->te_gpio)) { + if (!dsi->te_gpio) { + return 0; + } else if (IS_ERR(dsi->te_gpio)) { dev_err(dsi->dev, "gpio request failed with %ld\n", PTR_ERR(dsi->te_gpio)); return PTR_ERR(dsi->te_gpio); -- cgit From 4188db23285e28d9e9b9096f856cdcd7868005ee Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 24 Jan 2022 14:52:46 +0100 Subject: drm/exynos: Search for TE-gpio in DSI panel's node TE-gpio, if defined, is placed in the panel's node, not the parent DSI node. Change the devm_gpiod_get_optional() to gpiod_get_optional() and pass proper device node to it. The code already has a proper cleanup path, so it looks that the devm_* variant has been applied accidentally during the conversion to gpiod API. Fixes: ee6c8b5afa62 ("drm/exynos: Replace legacy gpio interface for gpiod interface") Signed-off-by: Marek Szyprowski Fixed a typo. Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_dsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c b/drivers/gpu/drm/exynos/exynos_drm_dsi.c index 14ebbb124852..d13f5e3a030d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c @@ -1334,7 +1334,7 @@ static int exynos_dsi_register_te_irq(struct exynos_dsi *dsi, int ret; int te_gpio_irq; - dsi->te_gpio = devm_gpiod_get_optional(dsi->dev, "te", GPIOD_IN); + dsi->te_gpio = gpiod_get_optional(panel, "te", GPIOD_IN); if (!dsi->te_gpio) { return 0; } else if (IS_ERR(dsi->te_gpio)) { -- cgit From aa091a6a91df395a0fa00a808a543301ec99e734 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 19 Feb 2022 15:15:36 +0100 Subject: clk: lan966x: Fix linking error If the config options HAS_IOMEM is not set then the driver fails to link with the following error: clk-lan966x.c:(.text+0x950): undefined reference to `devm_platform_ioremap_resource' Therefor add missing dependencies: HAS_IOMEM and OF. Fixes: 54104ee02333 ("clk: lan966x: Add lan966x SoC clock driver") Reported-by: kernel test robot Signed-off-by: Horatiu Vultur Link: https://lore.kernel.org/r/20220219141536.460812-1-horatiu.vultur@microchip.com Reviewed-by: Nicolas Ferre Signed-off-by: Stephen Boyd --- drivers/clk/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index ad4256d54361..d4d67fbae869 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -231,6 +231,8 @@ config COMMON_CLK_GEMINI config COMMON_CLK_LAN966X bool "Generic Clock Controller driver for LAN966X SoC" + depends on HAS_IOMEM + depends on OF help This driver provides support for Generic Clock Controller(GCK) on LAN966X SoC. GCK generates and supplies clock to various peripherals -- cgit From 07c2c7a3b622e109ba4d2efd916da0477617ce81 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 24 Feb 2022 16:52:57 -0800 Subject: mptcp: accurate SIOCOUTQ for fallback socket The MPTCP SIOCOUTQ implementation is not very accurate in case of fallback: it only measures the data in the MPTCP-level write queue, but it does not take in account the subflow write queue utilization. In case of fallback the first can be empty, while the latter is not. The above produces sporadic self-tests issues and can foul legit user-space application. Fix the issue additionally querying the subflow in case of fallback. Fixes: 644807e3e462 ("mptcp: add SIOCINQ, OUTQ and OUTQNSD ioctls") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/260 Reported-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f60f01b14fac..12bb28c5007e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3294,6 +3294,17 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) return 0; delta = msk->write_seq - v; + if (__mptcp_check_fallback(msk) && msk->first) { + struct tcp_sock *tp = tcp_sk(msk->first); + + /* the first subflow is disconnected after close - see + * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq + * so ignore that status, too. + */ + if (!((1 << msk->first->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) + delta += READ_ONCE(tp->write_seq) - tp->snd_una; + } if (delta > INT_MAX) delta = INT_MAX; -- cgit From 63bb8239d80571204c61d19c73f7bf5e3d9ef5fa Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 24 Feb 2022 16:52:58 -0800 Subject: selftests: mptcp: do complete cleanup at exit After commit 05be5e273c84 ("selftests: mptcp: add disconnect tests") the mptcp selftests leave behind a couple of tmp files after each run. run_tests_disconnect() misnames a few variables used to track them. Address the issue setting the appropriate global variables Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index cb5809b89081..f0f4ab96b8f3 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -763,8 +763,8 @@ run_tests_disconnect() run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-I 3 -i $old_cin" # restore previous status - cout=$old_cout - cout_disconnect="$cout".disconnect + sin=$old_sin + sin_disconnect="$cout".disconnect cin=$old_cin cin_disconnect="$cin".disconnect connect_per_transfer=1 -- cgit From 877d11f0332cd2160e19e3313e262754c321fa36 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 24 Feb 2022 16:52:59 -0800 Subject: mptcp: Correctly set DATA_FIN timeout when number of retransmits is large Syzkaller with UBSAN uncovered a scenario where a large number of DATA_FIN retransmits caused a shift-out-of-bounds in the DATA_FIN timeout calculation: ================================================================================ UBSAN: shift-out-of-bounds in net/mptcp/protocol.c:470:29 shift exponent 32 is too large for 32-bit type 'unsigned int' CPU: 1 PID: 13059 Comm: kworker/1:0 Not tainted 5.17.0-rc2-00630-g5fbf21c90c60 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: events mptcp_worker Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_shift_out_of_bounds.cold+0xb2/0x20e lib/ubsan.c:330 mptcp_set_datafin_timeout net/mptcp/protocol.c:470 [inline] __mptcp_retrans.cold+0x72/0x77 net/mptcp/protocol.c:2445 mptcp_worker+0x58a/0xa70 net/mptcp/protocol.c:2528 process_one_work+0x9df/0x16d0 kernel/workqueue.c:2307 worker_thread+0x95/0xe10 kernel/workqueue.c:2454 kthread+0x2f4/0x3b0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 ================================================================================ This change limits the maximum timeout by limiting the size of the shift, which keeps all intermediate values in-bounds. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/259 Fixes: 6477dd39e62c ("mptcp: Retransmit DATA_FIN") Acked-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 12bb28c5007e..1c72f25f083e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -466,9 +466,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) static void mptcp_set_datafin_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + u32 retransmits; - mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, - TCP_RTO_MIN << icsk->icsk_retransmits); + retransmits = min_t(u32, icsk->icsk_retransmits, + ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); + + mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) -- cgit From f4896248e9025ff744b4147e6758274a1cb8cbae Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 12 Feb 2022 20:27:13 +0900 Subject: can: etas_es58x: change opened_channel_cnt's type from atomic_t to u8 The driver uses an atomic_t variable: struct es58x_device::opened_channel_cnt to keep track of the number of opened channels in order to only allocate memory for the URBs when this count changes from zero to one. While the intent was to prevent race conditions, the choice of an atomic_t turns out to be a bad idea for several reasons: - implementation is incorrect and fails to decrement opened_channel_cnt when the URB allocation fails as reported in [1]. - even if opened_channel_cnt were to be correctly decremented, atomic_t is insufficient to cover edge cases: there can be a race condition in which 1/ a first process fails to allocate URBs memory 2/ a second process enters es58x_open() before the first process does its cleanup and decrements opened_channed_cnt. In which case, the second process would successfully return despite the URBs memory not being allocated. - actually, any kind of locking mechanism was useless here because it is redundant with the network stack big kernel lock (a.k.a. rtnl_lock) which is being hold by all the callers of net_device_ops:ndo_open() and net_device_ops:ndo_close(). c.f. the ASSERST_RTNL() calls in __dev_open() [2] and __dev_close_many() [3]. The atmomic_t is thus replaced by a simple u8 type and the logic to increment and decrement es58x_device:opened_channel_cnt is simplified accordingly fixing the bug reported in [1]. We do not check again for ASSERST_RTNL() as this is already done by the callers. [1] https://lore.kernel.org/linux-can/20220201140351.GA2548@kili/T/#u [2] https://elixir.bootlin.com/linux/v5.16/source/net/core/dev.c#L1463 [3] https://elixir.bootlin.com/linux/v5.16/source/net/core/dev.c#L1541 Fixes: 8537257874e9 ("can: etas_es58x: add core support for ETAS ES58X CAN USB interfaces") Link: https://lore.kernel.org/all/20220212112713.577957-1-mailhol.vincent@wanadoo.fr Reported-by: Dan Carpenter Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/etas_es58x/es58x_core.c | 9 +++++---- drivers/net/can/usb/etas_es58x/es58x_core.h | 8 +++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index 2ed2370a3166..2d73ebbf3836 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -1787,7 +1787,7 @@ static int es58x_open(struct net_device *netdev) struct es58x_device *es58x_dev = es58x_priv(netdev)->es58x_dev; int ret; - if (atomic_inc_return(&es58x_dev->opened_channel_cnt) == 1) { + if (!es58x_dev->opened_channel_cnt) { ret = es58x_alloc_rx_urbs(es58x_dev); if (ret) return ret; @@ -1805,12 +1805,13 @@ static int es58x_open(struct net_device *netdev) if (ret) goto free_urbs; + es58x_dev->opened_channel_cnt++; netif_start_queue(netdev); return ret; free_urbs: - if (atomic_dec_and_test(&es58x_dev->opened_channel_cnt)) + if (!es58x_dev->opened_channel_cnt) es58x_free_urbs(es58x_dev); netdev_err(netdev, "%s: Could not open the network device: %pe\n", __func__, ERR_PTR(ret)); @@ -1845,7 +1846,8 @@ static int es58x_stop(struct net_device *netdev) es58x_flush_pending_tx_msg(netdev); - if (atomic_dec_and_test(&es58x_dev->opened_channel_cnt)) + es58x_dev->opened_channel_cnt--; + if (!es58x_dev->opened_channel_cnt) es58x_free_urbs(es58x_dev); return 0; @@ -2215,7 +2217,6 @@ static struct es58x_device *es58x_init_es58x_dev(struct usb_interface *intf, init_usb_anchor(&es58x_dev->tx_urbs_idle); init_usb_anchor(&es58x_dev->tx_urbs_busy); atomic_set(&es58x_dev->tx_urbs_idle_cnt, 0); - atomic_set(&es58x_dev->opened_channel_cnt, 0); usb_set_intfdata(intf, es58x_dev); es58x_dev->rx_pipe = usb_rcvbulkpipe(es58x_dev->udev, diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.h b/drivers/net/can/usb/etas_es58x/es58x_core.h index 826a15871573..e5033cb5e695 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.h +++ b/drivers/net/can/usb/etas_es58x/es58x_core.h @@ -373,8 +373,6 @@ struct es58x_operators { * queue wake/stop logic should prevent this URB from getting * empty. Please refer to es58x_get_tx_urb() for more details. * @tx_urbs_idle_cnt: number of urbs in @tx_urbs_idle. - * @opened_channel_cnt: number of channels opened (c.f. es58x_open() - * and es58x_stop()). * @ktime_req_ns: kernel timestamp when es58x_set_realtime_diff_ns() * was called. * @realtime_diff_ns: difference in nanoseconds between the clocks of @@ -384,6 +382,10 @@ struct es58x_operators { * in RX branches. * @rx_max_packet_size: Maximum length of bulk-in URB. * @num_can_ch: Number of CAN channel (i.e. number of elements of @netdev). + * @opened_channel_cnt: number of channels opened. Free of race + * conditions because its two users (net_device_ops:ndo_open() + * and net_device_ops:ndo_close()) guarantee that the network + * stack big kernel lock (a.k.a. rtnl_mutex) is being hold. * @rx_cmd_buf_len: Length of @rx_cmd_buf. * @rx_cmd_buf: The device might split the URB commands in an * arbitrary amount of pieces. This buffer is used to concatenate @@ -406,7 +408,6 @@ struct es58x_device { struct usb_anchor tx_urbs_busy; struct usb_anchor tx_urbs_idle; atomic_t tx_urbs_idle_cnt; - atomic_t opened_channel_cnt; u64 ktime_req_ns; s64 realtime_diff_ns; @@ -415,6 +416,7 @@ struct es58x_device { u16 rx_max_packet_size; u8 num_can_ch; + u8 opened_channel_cnt; u16 rx_cmd_buf_len; union es58x_urb_cmd rx_cmd_buf; -- cgit From 035b0fcf02707d3c9c2890dc1484b11aa5335eb1 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 15 Feb 2022 08:48:14 +0900 Subject: can: gs_usb: change active_channels's type from atomic_t to u8 The driver uses an atomic_t variable: gs_usb:active_channels to keep track of the number of opened channels in order to only allocate memory for the URBs when this count changes from zero to one. However, the driver does not decrement the counter when an error occurs in gs_can_open(). This issue is fixed by changing the type from atomic_t to u8 and by simplifying the logic accordingly. It is safe to use an u8 here because the network stack big kernel lock (a.k.a. rtnl_mutex) is being hold. For details, please refer to [1]. [1] https://lore.kernel.org/linux-can/CAMZ6Rq+sHpiw34ijPsmp7vbUpDtJwvVtdV7CvRZJsLixjAFfrg@mail.gmail.com/T/#t Fixes: d08e973a77d1 ("can: gs_usb: Added support for the GS_USB CAN devices") Link: https://lore.kernel.org/all/20220214234814.1321599-1-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index b487e3fe770a..d35749fad1ef 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -191,8 +191,8 @@ struct gs_can { struct gs_usb { struct gs_can *canch[GS_MAX_INTF]; struct usb_anchor rx_submitted; - atomic_t active_channels; struct usb_device *udev; + u8 active_channels; }; /* 'allocate' a tx context. @@ -589,7 +589,7 @@ static int gs_can_open(struct net_device *netdev) if (rc) return rc; - if (atomic_add_return(1, &parent->active_channels) == 1) { + if (!parent->active_channels) { for (i = 0; i < GS_MAX_RX_URBS; i++) { struct urb *urb; u8 *buf; @@ -690,6 +690,7 @@ static int gs_can_open(struct net_device *netdev) dev->can.state = CAN_STATE_ERROR_ACTIVE; + parent->active_channels++; if (!(dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY)) netif_start_queue(netdev); @@ -705,7 +706,8 @@ static int gs_can_close(struct net_device *netdev) netif_stop_queue(netdev); /* Stop polling */ - if (atomic_dec_and_test(&parent->active_channels)) + parent->active_channels--; + if (!parent->active_channels) usb_kill_anchored_urbs(&parent->rx_submitted); /* Stop sending URBs */ @@ -984,8 +986,6 @@ static int gs_usb_probe(struct usb_interface *intf, init_usb_anchor(&dev->rx_submitted); - atomic_set(&dev->active_channels, 0); - usb_set_intfdata(intf, dev); dev->udev = interface_to_usbdev(intf); -- cgit From bca06b85fcaf866602e328b3bcd86f74180eca14 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Feb 2022 19:19:16 +0000 Subject: Revert "KVM: VMX: Save HOST_CR3 in vmx_set_host_fs_gs()" Undo a nested VMX fix as a step toward reverting the commit it fixed, 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()"), as the underlying premise that "host CR3 in the vcpu thread can only be changed when scheduling" is wrong. This reverts commit a9f2705ec84449e3b8d70c804766f8e97e23080d. Signed-off-by: Sean Christopherson Message-Id: <20220224191917.3508476-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 20 +++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 ++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ba34e94049c7..c12f95004a72 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -246,8 +246,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, src = &prev->host_state; dest = &vmx->loaded_vmcs->host_state; - vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel, - src->fs_base, src->gs_base); + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); dest->ldt_sel = src->ldt_sel; #ifdef CONFIG_X86_64 dest->ds_sel = src->ds_sel; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index efda5e4d6247..beb68cd28aca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1080,14 +1080,9 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } -void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, - u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base) +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) { - if (unlikely(cr3 != host->cr3)) { - vmcs_writel(HOST_CR3, cr3); - host->cr3 = cr3; - } if (unlikely(fs_sel != host->fs_sel)) { if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); @@ -1119,6 +1114,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif + unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1182,8 +1178,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(), - fs_sel, gs_sel, fs_base, gs_base); + vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + + /* Host CR3 including its PCID is stable when guest state is loaded. */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != host_state->cr3)) { + vmcs_writel(HOST_CR3, cr3); + host_state->cr3 = cr3; + } vmx->guest_state_loaded = true; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 7f2c82e7f38f..9c6bfcd84008 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -374,9 +374,8 @@ int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); -void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, - u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base); +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); -- cgit From 1a71581012ddf1f465040ef3d9f700341fa3cf04 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Feb 2022 19:19:17 +0000 Subject: Revert "KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()" Revert back to refreshing vmcs.HOST_CR3 immediately prior to VM-Enter. The PCID (ASID) part of CR3 can be bumped without KVM being scheduled out, as the kernel will switch CR3 during __text_poke(), e.g. in response to a static key toggling. If switch_mm_irqs_off() chooses a new ASID for the mm associate with KVM, KVM will do VM-Enter => VM-Exit with a stale vmcs.HOST_CR3. Add a comment to explain why KVM must wait until VM-Enter is imminent to refresh vmcs.HOST_CR3. The following splat was captured by stashing vmcs.HOST_CR3 in kvm_vcpu and adding a WARN in load_new_mm_cr3() to fire if a new ASID is being loaded for the KVM-associated mm while KVM has a "running" vCPU: static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); ... WARN(vcpu && (vcpu->cr3 & GENMASK(11, 0)) != (new_mm_cr3 & GENMASK(11, 0)) && (vcpu->cr3 & PHYSICAL_PAGE_MASK) == (new_mm_cr3 & PHYSICAL_PAGE_MASK), "KVM is hosed, loading CR3 = %lx, vmcs.HOST_CR3 = %lx", new_mm_cr3, vcpu->cr3); } ------------[ cut here ]------------ KVM is hosed, loading CR3 = 8000000105393004, vmcs.HOST_CR3 = 105393003 WARNING: CPU: 4 PID: 20717 at arch/x86/mm/tlb.c:291 load_new_mm_cr3+0x82/0xe0 Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel CPU: 4 PID: 20717 Comm: stable Tainted: G W 5.17.0-rc3+ #747 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:load_new_mm_cr3+0x82/0xe0 RSP: 0018:ffffc9000489fa98 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 8000000105393004 RCX: 0000000000000027 RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff888277d1b788 RBP: 0000000000000004 R08: ffff888277d1b780 R09: ffffc9000489f8b8 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: ffff88810678a800 R14: 0000000000000004 R15: 0000000000000c33 FS: 00007fa9f0e72700(0000) GS:ffff888277d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001001b5003 CR4: 0000000000172ea0 Call Trace: switch_mm_irqs_off+0x1cb/0x460 __text_poke+0x308/0x3e0 text_poke_bp_batch+0x168/0x220 text_poke_finish+0x1b/0x30 arch_jump_label_transform_apply+0x18/0x30 static_key_slow_inc_cpuslocked+0x7c/0x90 static_key_slow_inc+0x16/0x20 kvm_lapic_set_base+0x116/0x190 kvm_set_apic_base+0xa5/0xe0 kvm_set_msr_common+0x2f4/0xf60 vmx_set_msr+0x355/0xe70 [kvm_intel] kvm_set_msr_ignored_check+0x91/0x230 kvm_emulate_wrmsr+0x36/0x120 vmx_handle_exit+0x609/0x6c0 [kvm_intel] kvm_arch_vcpu_ioctl_run+0x146f/0x1b80 kvm_vcpu_ioctl+0x279/0x690 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae ---[ end trace 0000000000000000 ]--- This reverts commit 15ad9762d69fd8e40a4a51828c1d6b0c1b8fbea0. Fixes: 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()") Reported-by: Wanpeng Li Cc: Lai Jiangshan Signed-off-by: Sean Christopherson Acked-by: Lai Jiangshan Message-Id: <20220224191917.3508476-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 +++++++- arch/x86/kvm/vmx/vmx.c | 24 ++++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c12f95004a72..dc822a1d403d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3055,7 +3055,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr4; + unsigned long cr3, cr4; bool vm_fail; if (!nested_early_check) @@ -3078,6 +3078,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) */ vmcs_writel(GUEST_RFLAGS, 0); + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index beb68cd28aca..b730d799c26e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1114,7 +1114,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif - unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1179,14 +1178,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - - /* Host CR3 including its PCID is stable when guest state is loaded. */ - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != host_state->cr3)) { - vmcs_writel(HOST_CR3, cr3); - host_state->cr3 = cr3; - } - vmx->guest_state_loaded = true; } @@ -6793,7 +6784,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr4; + unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -6836,6 +6827,19 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time + * it switches back to the current->mm, which can occur in KVM context + * when switching to a temporary mm to patch kernel code, e.g. if KVM + * toggles a static key while handling a VM-Exit. + */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); -- cgit From dcf4ff7a48e7598e6b10126cc02177abb8ae4f3f Mon Sep 17 00:00:00 2001 From: Marek Marczykowski-Górecki Date: Wed, 23 Feb 2022 22:19:54 +0100 Subject: xen/netfront: destroy queues before real_num_tx_queues is zeroed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xennet_destroy_queues() relies on info->netdev->real_num_tx_queues to delete queues. Since d7dac083414eb5bb99a6d2ed53dc2c1b405224e5 ("net-sysfs: update the queue counts in the unregistration path"), unregister_netdev() indirectly sets real_num_tx_queues to 0. Those two facts together means, that xennet_destroy_queues() called from xennet_remove() cannot do its job, because it's called after unregister_netdev(). This results in kfree-ing queues that are still linked in napi, which ultimately crashes: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 52 Comm: xenwatch Tainted: G W 5.16.10-1.32.fc32.qubes.x86_64+ #226 RIP: 0010:free_netdev+0xa3/0x1a0 Code: ff 48 89 df e8 2e e9 00 00 48 8b 43 50 48 8b 08 48 8d b8 a0 fe ff ff 48 8d a9 a0 fe ff ff 49 39 c4 75 26 eb 47 e8 ed c1 66 ff <48> 8b 85 60 01 00 00 48 8d 95 60 01 00 00 48 89 ef 48 2d 60 01 00 RSP: 0000:ffffc90000bcfd00 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88800edad000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffc90000bcfc30 RDI: 00000000ffffffff RBP: fffffffffffffea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff88800edad050 R13: ffff8880065f8f88 R14: 0000000000000000 R15: ffff8880066c6680 FS: 0000000000000000(0000) GS:ffff8880f3300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000e998c006 CR4: 00000000003706e0 Call Trace: xennet_remove+0x13d/0x300 [xen_netfront] xenbus_dev_remove+0x6d/0xf0 __device_release_driver+0x17a/0x240 device_release_driver+0x24/0x30 bus_remove_device+0xd8/0x140 device_del+0x18b/0x410 ? _raw_spin_unlock+0x16/0x30 ? klist_iter_exit+0x14/0x20 ? xenbus_dev_request_and_reply+0x80/0x80 device_unregister+0x13/0x60 xenbus_dev_changed+0x18e/0x1f0 xenwatch_thread+0xc0/0x1a0 ? do_wait_intr_irq+0xa0/0xa0 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Fix this by calling xennet_destroy_queues() from xennet_uninit(), when real_num_tx_queues is still available. This ensures that queues are destroyed when real_num_tx_queues is set to 0, regardless of how unregister_netdev() was called. Originally reported at https://github.com/QubesOS/qubes-issues/issues/7257 Fixes: d7dac083414eb5bb9 ("net-sysfs: update the queue counts in the unregistration path") Cc: stable@vger.kernel.org Signed-off-by: Marek Marczykowski-Górecki Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 8b18246ad999..7748f07e2cf1 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -842,6 +842,28 @@ static int xennet_close(struct net_device *dev) return 0; } +static void xennet_destroy_queues(struct netfront_info *info) +{ + unsigned int i; + + for (i = 0; i < info->netdev->real_num_tx_queues; i++) { + struct netfront_queue *queue = &info->queues[i]; + + if (netif_running(info->netdev)) + napi_disable(&queue->napi); + netif_napi_del(&queue->napi); + } + + kfree(info->queues); + info->queues = NULL; +} + +static void xennet_uninit(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + xennet_destroy_queues(np); +} + static void xennet_set_rx_rsp_cons(struct netfront_queue *queue, RING_IDX val) { unsigned long flags; @@ -1611,6 +1633,7 @@ static int xennet_xdp(struct net_device *dev, struct netdev_bpf *xdp) } static const struct net_device_ops xennet_netdev_ops = { + .ndo_uninit = xennet_uninit, .ndo_open = xennet_open, .ndo_stop = xennet_close, .ndo_start_xmit = xennet_start_xmit, @@ -2103,22 +2126,6 @@ error: return err; } -static void xennet_destroy_queues(struct netfront_info *info) -{ - unsigned int i; - - for (i = 0; i < info->netdev->real_num_tx_queues; i++) { - struct netfront_queue *queue = &info->queues[i]; - - if (netif_running(info->netdev)) - napi_disable(&queue->napi); - netif_napi_del(&queue->napi); - } - - kfree(info->queues); - info->queues = NULL; -} - static int xennet_create_page_pool(struct netfront_queue *queue) -- cgit From 087a7b944c5db409f7c1a68bf4896c56ba54eaff Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 24 Feb 2022 12:38:29 +0100 Subject: net: stmmac: only enable DMA interrupts when ready In this driver's ->ndo_open() callback, it enables DMA interrupts, starts the DMA channels, then requests interrupts with request_irq(), and then finally enables napi. If RX DMA interrupts are received before napi is enabled, no processing is done because napi_schedule_prep() will return false. If the network has a lot of broadcast/multicast traffic, then the RX ring could fill up completely before napi is enabled. When this happens, no further RX interrupts will be delivered, and the driver will fail to receive any packets. Fix this by only enabling DMA interrupts after all other initialization is complete. Fixes: 523f11b5d4fd72efb ("net: stmmac: move hardware setup for stmmac_open to new function") Reported-by: Lars Persson Signed-off-by: Vincent Whitchurch Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 28 +++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index bde76ea2deec..cb9b6e08780c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2262,6 +2262,23 @@ static void stmmac_stop_tx_dma(struct stmmac_priv *priv, u32 chan) stmmac_stop_tx(priv, priv->ioaddr, chan); } +static void stmmac_enable_all_dma_irq(struct stmmac_priv *priv) +{ + u32 rx_channels_count = priv->plat->rx_queues_to_use; + u32 tx_channels_count = priv->plat->tx_queues_to_use; + u32 dma_csr_ch = max(rx_channels_count, tx_channels_count); + u32 chan; + + for (chan = 0; chan < dma_csr_ch; chan++) { + struct stmmac_channel *ch = &priv->channel[chan]; + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + stmmac_enable_dma_irq(priv, priv->ioaddr, chan, 1, 1); + spin_unlock_irqrestore(&ch->lock, flags); + } +} + /** * stmmac_start_all_dma - start all RX and TX DMA channels * @priv: driver private structure @@ -2904,8 +2921,10 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) stmmac_axi(priv, priv->ioaddr, priv->plat->axi); /* DMA CSR Channel configuration */ - for (chan = 0; chan < dma_csr_ch; chan++) + for (chan = 0; chan < dma_csr_ch; chan++) { stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan); + stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 1, 1); + } /* DMA RX Channel Configuration */ for (chan = 0; chan < rx_channels_count; chan++) { @@ -3761,6 +3780,7 @@ static int stmmac_open(struct net_device *dev) stmmac_enable_all_queues(priv); netif_tx_start_all_queues(priv->dev); + stmmac_enable_all_dma_irq(priv); return 0; @@ -6510,8 +6530,10 @@ int stmmac_xdp_open(struct net_device *dev) } /* DMA CSR Channel configuration */ - for (chan = 0; chan < dma_csr_ch; chan++) + for (chan = 0; chan < dma_csr_ch; chan++) { stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan); + stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 1, 1); + } /* Adjust Split header */ sph_en = (priv->hw->rx_csum > 0) && priv->sph; @@ -6572,6 +6594,7 @@ int stmmac_xdp_open(struct net_device *dev) stmmac_enable_all_queues(priv); netif_carrier_on(dev); netif_tx_start_all_queues(dev); + stmmac_enable_all_dma_irq(priv); return 0; @@ -7451,6 +7474,7 @@ int stmmac_resume(struct device *dev) stmmac_restore_hw_vlan_rx_fltr(priv, ndev, priv->hw); stmmac_enable_all_queues(priv); + stmmac_enable_all_dma_irq(priv); mutex_unlock(&priv->lock); rtnl_unlock(); -- cgit From 9f1c50cf39167ff71dc5953a3234f3f6eeb8fcb5 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 24 Feb 2022 23:26:19 +0800 Subject: net/smc: fix connection leak There's a potential leak issue under following execution sequence : smc_release smc_connect_work if (sk->sk_state == SMC_INIT) send_clc_confirim tcp_abort(); ... sk.sk_state = SMC_ACTIVE smc_close_active switch(sk->sk_state) { ... case SMC_ACTIVE: smc_close_final() // then wait peer closed Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are still in the tcp send buffer, in which case our connection token cannot be delivered to the server side, which means that we cannot get a passive close message at all. Therefore, it is impossible for the to be disconnected at all. This patch tries a very simple way to avoid this issue, once the state has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the smc connection, considering that the state is SMC_INIT before tcp_abort(), abandoning the complete disconnection process should not cause too much problem. In fact, this problem may exist as long as the CLC CONFIRM message is not received by the server. Whether a timer should be added after smc_close_final() needs to be discussed in the future. But even so, this patch provides a faster release for connection in above case, it should also be valuable. Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets") Signed-off-by: D. Wythe Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 306d9e8cd1dd..81984c1c0e78 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -183,7 +183,7 @@ static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = 0; + int old_state, rc = 0; if (!sk) goto out; @@ -191,8 +191,10 @@ static int smc_release(struct socket *sock) sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); + old_state = sk->sk_state; + /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) @@ -206,6 +208,10 @@ static int smc_release(struct socket *sock) else lock_sock(sk); + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); + rc = __smc_release(smc); /* detach socket */ -- cgit From 91b0383fef06f20b847fa9e4f0e3054ead0b1a1b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 24 Feb 2022 18:01:54 +0200 Subject: net: dcb: flush lingering app table entries for unregistered devices If I'm not mistaken (and I don't think I am), the way in which the dcbnl_ops work is that drivers call dcb_ieee_setapp() and this populates the application table with dynamically allocated struct dcb_app_type entries that are kept in the module-global dcb_app_list. However, nobody keeps exact track of these entries, and although dcb_ieee_delapp() is supposed to remove them, nobody does so when the interface goes away (example: driver unbinds from device). So the dcb_app_list will contain lingering entries with an ifindex that no longer matches any device in dcb_app_lookup(). Reclaim the lost memory by listening for the NETDEV_UNREGISTER event and flushing the app table entries of interfaces that are now gone. In fact something like this used to be done as part of the initial commit (blamed below), but it was done in dcbnl_exit() -> dcb_flushapp(), essentially at module_exit time. That became dead code after commit 7a6b6f515f77 ("DCB: fix kconfig option") which essentially merged "tristate config DCB" and "bool config DCBNL" into a single "bool config DCB", so net/dcb/dcbnl.c could not be built as a module anymore. Commit 36b9ad8084bd ("net/dcb: make dcbnl.c explicitly non-modular") recognized this and deleted dcbnl_exit() and dcb_flushapp() altogether, leaving us with the version we have today. Since flushing application table entries can and should be done as soon as the netdevice disappears, fundamentally the commit that is to blame is the one that introduced the design of this API. Fixes: 9ab933ab2cc8 ("dcbnl: add appliction tlv handlers") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index b441ab330fd3..36c91273daac 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2073,8 +2073,52 @@ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); +static void dcbnl_flush_dev(struct net_device *dev) +{ + struct dcb_app_type *itr, *tmp; + + spin_lock(&dcb_lock); + + list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { + if (itr->ifindex == dev->ifindex) { + list_del(&itr->list); + kfree(itr); + } + } + + spin_unlock(&dcb_lock); +} + +static int dcbnl_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_UNREGISTER: + if (!dev->dcbnl_ops) + return NOTIFY_DONE; + + dcbnl_flush_dev(dev); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block dcbnl_nb __read_mostly = { + .notifier_call = dcbnl_netdevice_event, +}; + static int __init dcbnl_init(void) { + int err; + + err = register_netdevice_notifier(&dcbnl_nb); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); -- cgit From 8d0657f39f487d904fca713e0bc39c2707382553 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:51 -0800 Subject: ibmvnic: free reset-work-item when flushing Fix a tiny memory leak when flushing the reset work queue. Fixes: 2770a7984db5 ("ibmvnic: Introduce hard reset recovery") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index dee05a353dbd..86d522171892 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2784,8 +2784,10 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, * flush reset queue and process this reset */ if (adapter->force_reset_recovery && !list_empty(&adapter->rwi_list)) { - list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) + list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) { list_del(entry); + kfree(list_entry(entry, struct ibmvnic_rwi, list)); + } } rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); -- cgit From 765559b10ce514eb1576595834f23cdc92125fee Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:52 -0800 Subject: ibmvnic: initialize rc before completing wait We should initialize ->init_done_rc before calling complete(). Otherwise the waiting thread may see ->init_done_rc as 0 before we have updated it and may assume that the CRQ was successful. Fixes: 6b278c0cb378 ("ibmvnic delay complete()") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 86d522171892..c73c699600a8 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5323,9 +5323,9 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, } if (!completion_done(&adapter->init_done)) { - complete(&adapter->init_done); if (!adapter->init_done_rc) adapter->init_done_rc = -EAGAIN; + complete(&adapter->init_done); } break; -- cgit From 83da53f7e4bd86dca4b2edc1e2bb324fb3c033a1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:53 -0800 Subject: ibmvnic: define flush_reset_queue helper Define and use a helper to flush the reset queue. Fixes: 2770a7984db5 ("ibmvnic: Introduce hard reset recovery") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c73c699600a8..42c3ac9ebb75 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2735,12 +2735,23 @@ static void __ibmvnic_delayed_reset(struct work_struct *work) __ibmvnic_reset(&adapter->ibmvnic_reset); } +static void flush_reset_queue(struct ibmvnic_adapter *adapter) +{ + struct list_head *entry, *tmp_entry; + + if (!list_empty(&adapter->rwi_list)) { + list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) { + list_del(entry); + kfree(list_entry(entry, struct ibmvnic_rwi, list)); + } + } +} + static int ibmvnic_reset(struct ibmvnic_adapter *adapter, enum ibmvnic_reset_reason reason) { - struct list_head *entry, *tmp_entry; - struct ibmvnic_rwi *rwi, *tmp; struct net_device *netdev = adapter->netdev; + struct ibmvnic_rwi *rwi, *tmp; unsigned long flags; int ret; @@ -2783,12 +2794,9 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, /* if we just received a transport event, * flush reset queue and process this reset */ - if (adapter->force_reset_recovery && !list_empty(&adapter->rwi_list)) { - list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) { - list_del(entry); - kfree(list_entry(entry, struct ibmvnic_rwi, list)); - } - } + if (adapter->force_reset_recovery) + flush_reset_queue(adapter); + rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); netdev_dbg(adapter->netdev, "Scheduling reset (reason %s)\n", -- cgit From 36491f2df9ad2501e5a4ec25d3d95d72bafd2781 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:54 -0800 Subject: ibmvnic: complete init_done on transport events If we get a transport event, set the error and mark the init as complete so the attempt to send crq-init or login fail sooner rather than wait for the timeout. Fixes: bbd669a868bb ("ibmvnic: Fix completion structure initialization") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 42c3ac9ebb75..5913d372bc27 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5356,6 +5356,13 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, adapter->fw_done_rc = -EIO; complete(&adapter->fw_done); } + + /* if we got here during crq-init, retry crq-init */ + if (!completion_done(&adapter->init_done)) { + adapter->init_done_rc = -EAGAIN; + complete(&adapter->init_done); + } + if (!completion_done(&adapter->stats_done)) complete(&adapter->stats_done); if (test_bit(0, &adapter->resetting)) -- cgit From 570425f8c7c18b14fa8a2a58a0adb431968ad118 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:55 -0800 Subject: ibmvnic: register netdev after init of adapter Finish initializing the adapter before registering netdev so state is consistent. Fixes: c26eba03e407 ("ibmvnic: Update reset infrastructure to support tunable parameters") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5913d372bc27..a7b03ca109d8 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5826,12 +5826,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) goto ibmvnic_dev_file_err; netif_carrier_off(netdev); - rc = register_netdev(netdev); - if (rc) { - dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); - goto ibmvnic_register_fail; - } - dev_info(&dev->dev, "ibmvnic registered\n"); if (init_success) { adapter->state = VNIC_PROBED; @@ -5844,6 +5838,14 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) adapter->wait_for_reset = false; adapter->last_reset_time = jiffies; + + rc = register_netdev(netdev); + if (rc) { + dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); + goto ibmvnic_register_fail; + } + dev_info(&dev->dev, "ibmvnic registered\n"); + return 0; ibmvnic_register_fail: -- cgit From ae16bf15374d8b055e040ac6f3f1147ab1c9bb7d Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:56 -0800 Subject: ibmvnic: init init_done_rc earlier We currently initialize the ->init_done completion/return code fields before issuing a CRQ_INIT command. But if we get a transport event soon after registering the CRQ the taskslet may already have recorded the completion and error code. If we initialize here, we might overwrite/ lose that and end up issuing the CRQ_INIT only to timeout later. If that timeout happens during probe, we will leave the adapter in the DOWN state rather than retrying to register/init the CRQ. Initialize the completion before registering the CRQ so we don't lose the notification. Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index a7b03ca109d8..16a8f92adbda 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2212,6 +2212,19 @@ static const char *reset_reason_to_string(enum ibmvnic_reset_reason reason) return "UNKNOWN"; } +/* + * Initialize the init_done completion and return code values. We + * can get a transport event just after registering the CRQ and the + * tasklet will use this to communicate the transport event. To ensure + * we don't miss the notification/error, initialize these _before_ + * regisering the CRQ. + */ +static inline void reinit_init_done(struct ibmvnic_adapter *adapter) +{ + reinit_completion(&adapter->init_done); + adapter->init_done_rc = 0; +} + /* * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. @@ -2318,6 +2331,8 @@ static int do_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; + reinit_init_done(adapter); + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { rc = init_crq_queue(adapter); } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { @@ -2461,7 +2476,8 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; - reinit_completion(&adapter->init_done); + reinit_init_done(adapter); + rc = init_crq_queue(adapter); if (rc) { netdev_err(adapter->netdev, @@ -5679,10 +5695,6 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset) adapter->from_passive_init = false; - if (reset) - reinit_completion(&adapter->init_done); - - adapter->init_done_rc = 0; rc = ibmvnic_send_crq_init(adapter); if (rc) { dev_err(dev, "Send crq init failed with error %d\n", rc); @@ -5696,12 +5708,14 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset) if (adapter->init_done_rc) { release_crq_queue(adapter); + dev_err(dev, "CRQ-init failed, %d\n", adapter->init_done_rc); return adapter->init_done_rc; } if (adapter->from_passive_init) { adapter->state = VNIC_OPEN; adapter->from_passive_init = false; + dev_err(dev, "CRQ-init failed, passive-init\n"); return -EINVAL; } @@ -5795,6 +5809,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) init_success = false; do { + reinit_init_done(adapter); + rc = init_crq_queue(adapter); if (rc) { dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n", -- cgit From f628ad531b4f34fdba0984255b4a2850dd369513 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:57 -0800 Subject: ibmvnic: clear fop when retrying probe Clear ->failover_pending flag that may have been set in the previous pass of registering CRQ. If we don't clear, a subsequent ibmvnic_open() call would be misled into thinking a failover is pending and assuming that the reset worker thread would open the adapter. If this pass of registering the CRQ succeeds (i.e there is no transport event), there wouldn't be a reset worker thread. This would leave the adapter unconfigured and require manual intervention to bring it up during boot. Fixes: 5a18e1e0c193 ("ibmvnic: Fix failover case for non-redundant configuration") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 16a8f92adbda..93580c68600a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5811,6 +5811,11 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) do { reinit_init_done(adapter); + /* clear any failovers we got in the previous pass + * since we are reinitializing the CRQ + */ + adapter->failover_pending = false; + rc = init_crq_queue(adapter); if (rc) { dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n", -- cgit From fd98693cb0721317f27341951593712c580c36a1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:58 -0800 Subject: ibmvnic: Allow queueing resets during probe We currently don't allow queuing resets when adapter is in VNIC_PROBING state - instead we throw away the reset and return EBUSY. The reasoning is probably that during ibmvnic_probe() the ibmvnic_adapter itself is being initialized so performing a reset during this time can lead us to accessing fields in the ibmvnic_adapter that are not fully initialized. A review of the code shows that all the adapter state neede to process a reset is initialized before registering the CRQ so that should no longer be a concern. Further the expectation is that if we do get a reset (transport event) during probe, the do..while() loop in ibmvnic_probe() will handle this by reinitializing the CRQ. While that is true to some extent, it is possible that the reset might occur _after_ the CRQ is registered and CRQ_INIT message was exchanged but _before_ the adapter state is set to VNIC_PROBED. As mentioned above, such a reset will be thrown away. While the client assumes that the adapter is functional, the vnic server will wait for the client to reinit the adapter. This disconnect between the two leaves the adapter down needing manual intervention. Because ibmvnic_probe() has other work to do after initializing the CRQ (such as registering the netdev at a minimum) and because the reset event can occur at any instant after the CRQ is initialized, there will always be a window between initializing the CRQ and considering the adapter ready for resets (ie state == PROBED). So rather than discarding resets during this window, allow queueing them - but only process them after the adapter is fully initialized. To do this, introduce a new completion state ->probe_done and have the reset worker thread wait on this before processing resets. This change brings up two new situations in or just after ibmvnic_probe(). First after one or more resets were queued, we encounter an error and decide to retry the initialization. At that point the queued resets are no longer relevant since we could be talking to a new vnic server. So we must purge/flush the queued resets before restarting the initialization. As a side note, since we are still in the probing stage and we have not registered the netdev, it will not be CHANGE_PARAM reset. Second this change opens up a potential race between the worker thread in __ibmvnic_reset(), the tasklet and the ibmvnic_open() due to the following sequence of events: 1. Register CRQ 2. Get transport event before CRQ_INIT completes. 3. Tasklet schedules reset: a) add rwi to list b) schedule_work() to start worker thread which runs and waits for ->probe_done. 4. ibmvnic_probe() decides to retry, purges rwi_list 5. Re-register crq and this time rest of probe succeeds - register netdev and complete(->probe_done). 6. Worker thread resumes in __ibmvnic_reset() from 3b. 7. Worker thread sets ->resetting bit 8. ibmvnic_open() comes in, notices ->resetting bit, sets state to IBMVNIC_OPEN and returns early expecting worker thread to finish the open. 9. Worker thread finds rwi_list empty and returns without opening the interface. If this happens, the ->ndo_open() call is effectively lost and the interface remains down. To address this, ensure that ->rwi_list is not empty before setting the ->resetting bit. See also comments in __ibmvnic_reset(). Fixes: 6a2fb0e99f9c ("ibmvnic: driver initialization for kdump/kexec") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 107 +++++++++++++++++++++++++++++++++---- drivers/net/ethernet/ibm/ibmvnic.h | 1 + 2 files changed, 98 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 93580c68600a..b423e94956f1 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2618,23 +2618,82 @@ out: static void __ibmvnic_reset(struct work_struct *work) { struct ibmvnic_adapter *adapter; - bool saved_state = false; + unsigned int timeout = 5000; struct ibmvnic_rwi *tmprwi; + bool saved_state = false; struct ibmvnic_rwi *rwi; unsigned long flags; - u32 reset_state; + struct device *dev; + bool need_reset; int num_fails = 0; + u32 reset_state; int rc = 0; adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); + dev = &adapter->vdev->dev; - if (test_and_set_bit_lock(0, &adapter->resetting)) { + /* Wait for ibmvnic_probe() to complete. If probe is taking too long + * or if another reset is in progress, defer work for now. If probe + * eventually fails it will flush and terminate our work. + * + * Three possibilities here: + * 1. Adpater being removed - just return + * 2. Timed out on probe or another reset in progress - delay the work + * 3. Completed probe - perform any resets in queue + */ + if (adapter->state == VNIC_PROBING && + !wait_for_completion_timeout(&adapter->probe_done, timeout)) { + dev_err(dev, "Reset thread timed out on probe"); queue_delayed_work(system_long_wq, &adapter->ibmvnic_delayed_reset, IBMVNIC_RESET_DELAY); return; } + /* adapter is done with probe (i.e state is never VNIC_PROBING now) */ + if (adapter->state == VNIC_REMOVING) + return; + + /* ->rwi_list is stable now (no one else is removing entries) */ + + /* ibmvnic_probe() may have purged the reset queue after we were + * scheduled to process a reset so there maybe no resets to process. + * Before setting the ->resetting bit though, we have to make sure + * that there is infact a reset to process. Otherwise we may race + * with ibmvnic_open() and end up leaving the vnic down: + * + * __ibmvnic_reset() ibmvnic_open() + * ----------------- -------------- + * + * set ->resetting bit + * find ->resetting bit is set + * set ->state to IBMVNIC_OPEN (i.e + * assume reset will open device) + * return + * find reset queue empty + * return + * + * Neither performed vnic login/open and vnic stays down + * + * If we hold the lock and conditionally set the bit, either we + * or ibmvnic_open() will complete the open. + */ + need_reset = false; + spin_lock(&adapter->rwi_lock); + if (!list_empty(&adapter->rwi_list)) { + if (test_and_set_bit_lock(0, &adapter->resetting)) { + queue_delayed_work(system_long_wq, + &adapter->ibmvnic_delayed_reset, + IBMVNIC_RESET_DELAY); + } else { + need_reset = true; + } + } + spin_unlock(&adapter->rwi_lock); + + if (!need_reset) + return; + rwi = get_next_rwi(adapter); while (rwi) { spin_lock_irqsave(&adapter->state_lock, flags); @@ -2786,13 +2845,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, goto err; } - if (adapter->state == VNIC_PROBING) { - netdev_warn(netdev, "Adapter reset during probe\n"); - adapter->init_done_rc = -EAGAIN; - ret = EAGAIN; - goto err; - } - list_for_each_entry(tmp, &adapter->rwi_list, list) { if (tmp->reset_reason == reason) { netdev_dbg(netdev, "Skipping matching reset, reason=%s\n", @@ -5755,6 +5807,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) struct ibmvnic_adapter *adapter; struct net_device *netdev; unsigned char *mac_addr_p; + unsigned long flags; bool init_success; int rc; @@ -5799,6 +5852,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) spin_lock_init(&adapter->rwi_lock); spin_lock_init(&adapter->state_lock); mutex_init(&adapter->fw_lock); + init_completion(&adapter->probe_done); init_completion(&adapter->init_done); init_completion(&adapter->fw_done); init_completion(&adapter->reset_done); @@ -5816,6 +5870,26 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) */ adapter->failover_pending = false; + /* If we had already initialized CRQ, we may have one or + * more resets queued already. Discard those and release + * the CRQ before initializing the CRQ again. + */ + release_crq_queue(adapter); + + /* Since we are still in PROBING state, __ibmvnic_reset() + * will not access the ->rwi_list and since we released CRQ, + * we won't get _new_ transport events. But there maybe an + * ongoing ibmvnic_reset() call. So serialize access to + * rwi_list. If we win the race, ibvmnic_reset() could add + * a reset after we purged but thats ok - we just may end + * up with an extra reset (i.e similar to having two or more + * resets in the queue at once). + * CHECK. + */ + spin_lock_irqsave(&adapter->rwi_lock, flags); + flush_reset_queue(adapter); + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + rc = init_crq_queue(adapter); if (rc) { dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n", @@ -5867,6 +5941,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } dev_info(&dev->dev, "ibmvnic registered\n"); + complete(&adapter->probe_done); + return 0; ibmvnic_register_fail: @@ -5881,6 +5957,17 @@ ibmvnic_stats_fail: ibmvnic_init_fail: release_sub_crqs(adapter, 1); release_crq_queue(adapter); + + /* cleanup worker thread after releasing CRQ so we don't get + * transport events (i.e new work items for the worker thread). + */ + adapter->state = VNIC_REMOVING; + complete(&adapter->probe_done); + flush_work(&adapter->ibmvnic_reset); + flush_delayed_work(&adapter->ibmvnic_delayed_reset); + + flush_reset_queue(adapter); + mutex_destroy(&adapter->fw_lock); free_netdev(netdev); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 4a7a56ff74ce..fa2d607a7b1b 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -930,6 +930,7 @@ struct ibmvnic_adapter { struct ibmvnic_tx_pool *tx_pool; struct ibmvnic_tx_pool *tso_pool; + struct completion probe_done; struct completion init_done; int init_done_rc; -- cgit From 767b9825ed1765894e569a3d698749d40d83762a Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 25 Feb 2022 04:37:27 -0800 Subject: net: chelsio: cxgb3: check the return value of pci_find_capability() The function pci_find_capability() in t3_prep_adapter() can fail, so its return value should be checked. Fixes: 4d22de3e6cc4 ("Add support for the latest 1G/10G Chelsio adapter, T3") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb3/t3_hw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c index da41eee2f25c..a06003bfa04b 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -3613,6 +3613,8 @@ int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai, MAC_STATS_ACCUM_SECS : (MAC_STATS_ACCUM_SECS * 10); adapter->params.pci.vpd_cap_addr = pci_find_capability(adapter->pdev, PCI_CAP_ID_VPD); + if (!adapter->params.pci.vpd_cap_addr) + return -ENODEV; ret = get_vpd_params(adapter, &adapter->params.vpd); if (ret < 0) return ret; -- cgit From b3a34dc362c03215031b268fcc0b988e69490231 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Fri, 25 Feb 2022 11:15:16 +0100 Subject: net: sparx5: Fix add vlan when invalid operation Check if operation is valid before changing any settings in hardware. Otherwise it results in changes being made despite it not being a valid operation. Fixes: 78eab33bb68b ("net: sparx5: add vlan support") Signed-off-by: Casper Andersson Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c b/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c index 4ce490a25f33..8e56ffa1c4f7 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c @@ -58,16 +58,6 @@ int sparx5_vlan_vid_add(struct sparx5_port *port, u16 vid, bool pvid, struct sparx5 *sparx5 = port->sparx5; int ret; - /* Make the port a member of the VLAN */ - set_bit(port->portno, sparx5->vlan_mask[vid]); - ret = sparx5_vlant_set_mask(sparx5, vid); - if (ret) - return ret; - - /* Default ingress vlan classification */ - if (pvid) - port->pvid = vid; - /* Untagged egress vlan classification */ if (untagged && port->vid != vid) { if (port->vid) { @@ -79,6 +69,16 @@ int sparx5_vlan_vid_add(struct sparx5_port *port, u16 vid, bool pvid, port->vid = vid; } + /* Make the port a member of the VLAN */ + set_bit(port->portno, sparx5->vlan_mask[vid]); + ret = sparx5_vlant_set_mask(sparx5, vid); + if (ret) + return ret; + + /* Default ingress vlan classification */ + if (pvid) + port->pvid = vid; + sparx5_vlan_port_apply(sparx5, port); return 0; -- cgit From 456f89e0928ab938122a40e9f094a6524cc158b4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Feb 2022 13:16:24 +0000 Subject: KVM: selftests: aarch64: Skip tests if we can't create a vgic-v3 The arch_timer and vgic_irq kselftests assume that they can create a vgic-v3, using the library function vgic_v3_setup() which aborts with a test failure if it is not possible to do so. Since vgic-v3 can only be instantiated on systems where the host has GICv3 this leads to false positives on older systems where that is not the case. Fix this by changing vgic_v3_setup() to return an error if the vgic can't be instantiated and have the callers skip if this happens. We could also exit flagging a skip in vgic_v3_setup() but this would prevent future test cases conditionally deciding which GIC to use or generally doing more complex output. Signed-off-by: Mark Brown Reviewed-by: Andrew Jones Tested-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220223131624.1830351-1-broonie@kernel.org --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 7 ++++++- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 4 ++++ tools/testing/selftests/kvm/lib/aarch64/vgic.c | 4 +++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 9ad38bd360a4..b08d30bf71c5 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -366,6 +366,7 @@ static struct kvm_vm *test_vm_create(void) { struct kvm_vm *vm; unsigned int i; + int ret; int nr_vcpus = test_args.nr_vcpus; vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); @@ -382,7 +383,11 @@ static struct kvm_vm *test_vm_create(void) ucall_init(vm, NULL); test_init_timer_irq(vm); - vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + ret = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + if (ret < 0) { + print_skip("Failed to create vgic-v3"); + exit(KSFT_SKIP); + } /* Make all the test's cmdline args visible to the guest */ sync_global_to_guest(vm, test_args); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index e6c7d7f8fbd1..7eca97799917 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -761,6 +761,10 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) gic_fd = vgic_v3_setup(vm, 1, nr_irqs, GICD_BASE_GPA, GICR_BASE_GPA); + if (gic_fd < 0) { + print_skip("Failed to create vgic-v3, skipping"); + exit(KSFT_SKIP); + } vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handlers[args.eoi_split][args.level_sensitive]); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index b3a0fca0d780..f5cd0c536d85 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -52,7 +52,9 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, nr_vcpus, nr_vcpus_created); /* Distributor setup */ - gic_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + if (_kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, + false, &gic_fd) != 0) + return -1; kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs, true); -- cgit From 40cd58dbf121e1d0c18f1bd4dd10335ae45a28fc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 25 Feb 2022 00:29:40 -0800 Subject: x86/kvm: Don't use PV TLB/yield when mwait is advertised MWAIT is advertised in host is not overcommitted scenario, however, PV TLB/sched yield should be enabled in host overcommitted scenario. Let's add the MWAIT checking when enabling PV TLB/sched yield. Signed-off-by: Wanpeng Li Message-Id: <1645777780-2581-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f734e3b0cfec..491e1d9ca750 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -463,6 +463,7 @@ static bool pv_tlb_flush_supported(void) return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && (num_possible_cpus() != 1)); } @@ -477,6 +478,7 @@ static bool pv_sched_yield_supported(void) return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && (num_possible_cpus() != 1)); } -- cgit From 3c51d0a6c761c2025c6db1ed4d3a7273167bf899 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 22 Feb 2022 01:02:03 -0800 Subject: x86/kvm: Don't waste memory if kvmclock is disabled Even if "no-kvmclock" is passed in cmdline parameter, the guest kernel still allocates hvclock_mem which is scaled by the number of vCPUs, let's check kvmclock enable in advance to avoid this memory waste. Signed-off-by: Wanpeng Li Message-Id: <1645520523-30814-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index a35cbf9107af..44ed677d401f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -239,6 +239,9 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { + if (!kvmclock) + return 0; + kvmclock_init_mem(); #ifdef CONFIG_X86_64 -- cgit From 92e68cc558774de01024c18e8b35cdce4731c910 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 25 Feb 2022 00:46:00 -0800 Subject: x86/kvmclock: Fix Hyper-V Isolated VM's boot issue when vCPUs > 64 When Linux runs as an Isolated VM on Hyper-V, it supports AMD SEV-SNP but it's partially enlightened, i.e. cc_platform_has( CC_ATTR_GUEST_MEM_ENCRYPT) is true but sev_active() is false. Commit 4d96f9109109 per se is good, but with it now kvm_setup_vsyscall_timeinfo() -> kvmclock_init_mem() calls set_memory_decrypted(), and later gets stuck when trying to zere out the pages pointed by 'hvclock_mem', if Linux runs as an Isolated VM on Hyper-V. The cause is that here now the Linux VM should no longer access the original guest physical addrss (GPA); instead the VM should do memremap() and access the original GPA + ms_hyperv.shared_gpa_boundary: see the example code in drivers/hv/connection.c: vmbus_connect() or drivers/hv/ring_buffer.c: hv_ringbuffer_init(). If the VM tries to access the original GPA, it keepts getting injected a fault by Hyper-V and gets stuck there. Here the issue happens only when the VM has >=65 vCPUs, because the global static array hv_clock_boot[] can hold 64 "struct pvclock_vsyscall_time_info" (the sizeof of the struct is 64 bytes), so kvmclock_init_mem() only allocates memory in the case of vCPUs > 64. Since the 'hvclock_mem' pages are only useful when the kvm clock is supported by the underlying hypervisor, fix the issue by returning early when Linux VM runs on Hyper-V, which doesn't support kvm clock. Fixes: 4d96f9109109 ("x86/sev: Replace occurrences of sev_active() with cc_platform_has()") Tested-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Signed-off-by: Dexuan Cui Message-Id: <20220225084600.17817-1-decui@microsoft.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44ed677d401f..c5caa7311bd8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -239,7 +239,7 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { - if (!kvmclock) + if (!kvm_para_available() || !kvmclock) return 0; kvmclock_init_mem(); -- cgit From 9ee83635d872812f3920209c606c6ea9e412ffcc Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 9 Feb 2022 12:16:41 +0800 Subject: KVM: x86: Yield to IPI target vCPU only if it is busy When sending a call-function IPI-many to vCPUs, yield to the IPI target vCPU which is marked as preempted. but when emulating HLT, an idling vCPU will be voluntarily scheduled out and mark as preempted from the guest kernel perspective. yielding to idle vCPU is pointless and increase unnecessary vmexit, maybe miss the true preempted vCPU so yield to IPI target vCPU only if vCPU is busy and preempted Signed-off-by: Li RongQing Message-Id: <1644380201-29423-1-git-send-email-lirongqing@baidu.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 491e1d9ca750..d77481ecb0d5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -624,7 +624,7 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) /* Make sure other vCPUs get a chance to run if they need to. */ for_each_cpu(cpu, mask) { - if (vcpu_is_preempted(cpu)) { + if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); break; } -- cgit From 0ac983f512033cb7b5e210c9589768ad25b1e36b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Feb 2022 08:32:28 -0600 Subject: ucounts: Fix systemd LimitNPROC with private users regression Long story short recursively enforcing RLIMIT_NPROC when it is not enforced on the process that creates a new user namespace, causes currently working code to fail. There is no reason to enforce RLIMIT_NPROC recursively when we don't enforce it normally so update the code to detect this case. I would like to simply use capable(CAP_SYS_RESOURCE) to detect when RLIMIT_NPROC is not enforced upon the caller. Unfortunately because RLIMIT_NPROC is charged and checked for enforcement based upon the real uid, using capable() which is euid based is inconsistent with reality. Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by testing for when the real uid would match the conditions when CAP_SYS_RESOURCE would be present if the real uid was the effective uid. Reported-by: Etienne Dechamps Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596 Link: https://lkml.kernel.org/r/e9589141-cfeb-90cd-2d0e-83a62787239a@edechamps.fr Link: https://lkml.kernel.org/r/87sfs8jmpz.fsf_-_@email.froward.int.ebiederm.org Cc: stable@vger.kernel.org Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6b2e3ca7ee99..5481ba44a8d6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->user_ns = user_ns; } +static unsigned long enforced_nproc_rlimit(void) +{ + unsigned long limit = RLIM_INFINITY; + + /* Is RLIMIT_NPROC currently enforced? */ + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || + (current_user_ns() != &init_user_ns)) + limit = rlimit(RLIMIT_NPROC); + + return limit; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); -- cgit From c5048a7b2c23ab589f3476a783bd586b663eda5b Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Mon, 21 Feb 2022 22:59:35 +0000 Subject: can: rcar_canfd: rcar_canfd_channel_probe(): register the CAN device when fully ready Register the CAN device only when all the necessary initialization is completed. This patch makes sure all the data structures and locks are initialized before registering the CAN device. Link: https://lore.kernel.org/all/20220221225935.12300-1-prabhakar.mahadev-lad.rj@bp.renesas.com Reported-by: Pavel Machek Signed-off-by: Lad Prabhakar Reviewed-by: Pavel Machek Reviewed-by: Ulrich Hecht Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rcar/rcar_canfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index b7dc1c32875f..acd74725831f 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1715,15 +1715,15 @@ static int rcar_canfd_channel_probe(struct rcar_canfd_global *gpriv, u32 ch, netif_napi_add(ndev, &priv->napi, rcar_canfd_rx_poll, RCANFD_NAPI_WEIGHT); + spin_lock_init(&priv->tx_lock); + devm_can_led_init(ndev); + gpriv->ch[priv->channel] = priv; err = register_candev(ndev); if (err) { dev_err(&pdev->dev, "register_candev() failed, error %d\n", err); goto fail_candev; } - spin_lock_init(&priv->tx_lock); - devm_can_led_init(ndev); - gpriv->ch[priv->channel] = priv; dev_info(&pdev->dev, "device registered (channel %u)\n", priv->channel); return 0; -- cgit From 50e06ddceeea263f57fe92baa677c638ecd65bb6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Feb 2022 19:35:28 -0800 Subject: net: sxgbe: fix return value of __setup handler __setup() handlers should return 1 on success, i.e., the parameter has been handled. A return of 0 causes the "option=value" string to be added to init's environment strings, polluting it. Fixes: acc18c147b22 ("net: sxgbe: add EEE(Energy Efficient Ethernet) for Samsung sxgbe") Fixes: 1edb9ca69e8a ("net: sxgbe: add basic framework for Samsung 10Gb ethernet driver") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Cc: Siva Reddy Cc: Girish K S Cc: Byungho An Link: https://lore.kernel.org/r/20220224033528.24640-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index 32161a56726c..2881f5b2b5f4 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -2285,18 +2285,18 @@ static int __init sxgbe_cmdline_opt(char *str) char *opt; if (!str || !*str) - return -EINVAL; + return 1; while ((opt = strsep(&str, ",")) != NULL) { if (!strncmp(opt, "eee_timer:", 10)) { if (kstrtoint(opt + 10, 0, &eee_timer)) goto err; } } - return 0; + return 1; err: pr_err("%s: ERROR broken module parameter conversion\n", __func__); - return -EINVAL; + return 1; } __setup("sxgbeeth=", sxgbe_cmdline_opt); -- cgit From e01b042e580f1fbf4fd8da467442451da00c7a90 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Feb 2022 19:35:36 -0800 Subject: net: stmmac: fix return value of __setup handler __setup() handlers should return 1 on success, i.e., the parameter has been handled. A return of 0 causes the "option=value" string to be added to init's environment strings, polluting it. Fixes: 47dd7a540b8a ("net: add support for STMicroelectronics Ethernet controllers.") Fixes: f3240e2811f0 ("stmmac: remove warning when compile as built-in (V2)") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Cc: Jose Abreu Link: https://lore.kernel.org/r/20220224033536.25056-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index cb9b6e08780c..422e3225f476 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7491,7 +7491,7 @@ static int __init stmmac_cmdline_opt(char *str) char *opt; if (!str || !*str) - return -EINVAL; + return 1; while ((opt = strsep(&str, ",")) != NULL) { if (!strncmp(opt, "debug:", 6)) { if (kstrtoint(opt + 6, 0, &debug)) @@ -7522,11 +7522,11 @@ static int __init stmmac_cmdline_opt(char *str) goto err; } } - return 0; + return 1; err: pr_err("%s: ERROR broken module parameter conversion", __func__); - return -EINVAL; + return 1; } __setup("stmmaceth=", stmmac_cmdline_opt); -- cgit From fc2e6b3b132a907378f6af08356b105a4139c4fb Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:35:49 +0100 Subject: iavf: Rework mutexes for better synchronisation The driver used to crash in multiple spots when put to stress testing of the init, reset and remove paths. The user would experience call traces or hangs when creating, resetting, removing VFs. Depending on the machines, the call traces are happening in random spots, like reset restoring resources racing with driver remove. Make adapter->crit_lock mutex a mandatory lock for guarding the operations performed on all workqueues and functions dealing with resource allocation and disposal. Make __IAVF_REMOVE a final state of the driver respected by workqueues that shall not requeue, when they fail to obtain the crit_lock. Make the IRQ handler not to queue the new work for adminq_task when the __IAVF_REMOVE state is set. Fixes: 5ac49f3c2702 ("iavf: use mutexes for locking of critical sections") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 - drivers/net/ethernet/intel/iavf/iavf_main.c | 66 ++++++++++++++++------------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 59806d1f7e79..44f83e06486d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -246,7 +246,6 @@ struct iavf_adapter { struct list_head mac_filter_list; struct mutex crit_lock; struct mutex client_lock; - struct mutex remove_lock; /* Lock to protect accesses to MAC and VLAN lists */ spinlock_t mac_vlan_list_lock; char misc_vector_name[IFNAMSIZ + 9]; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 8125b9120615..84ae96e912d7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -302,8 +302,9 @@ static irqreturn_t iavf_msix_aq(int irq, void *data) rd32(hw, IAVF_VFINT_ICR01); rd32(hw, IAVF_VFINT_ICR0_ENA1); - /* schedule work on the private workqueue */ - queue_work(iavf_wq, &adapter->adminq_task); + if (adapter->state != __IAVF_REMOVE) + /* schedule work on the private workqueue */ + queue_work(iavf_wq, &adapter->adminq_task); return IRQ_HANDLED; } @@ -2374,8 +2375,12 @@ static void iavf_watchdog_task(struct work_struct *work) struct iavf_hw *hw = &adapter->hw; u32 reg_val; - if (!mutex_trylock(&adapter->crit_lock)) + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state == __IAVF_REMOVE) + return; + goto restart_watchdog; + } if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) iavf_change_state(adapter, __IAVF_COMM_FAILED); @@ -2601,13 +2606,13 @@ static void iavf_reset_task(struct work_struct *work) /* When device is being removed it doesn't make sense to run the reset * task, just return in such a case. */ - if (mutex_is_locked(&adapter->remove_lock)) - return; + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state != __IAVF_REMOVE) + queue_work(iavf_wq, &adapter->reset_task); - if (iavf_lock_timeout(&adapter->crit_lock, 200)) { - schedule_work(&adapter->reset_task); return; } + while (!mutex_trylock(&adapter->client_lock)) usleep_range(500, 1000); if (CLIENT_ENABLED(adapter)) { @@ -2826,13 +2831,19 @@ static void iavf_adminq_task(struct work_struct *work) if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) goto out; + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state == __IAVF_REMOVE) + return; + + queue_work(iavf_wq, &adapter->adminq_task); + goto out; + } + event.buf_len = IAVF_MAX_AQ_BUF_SIZE; event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL); if (!event.msg_buf) goto out; - if (iavf_lock_timeout(&adapter->crit_lock, 200)) - goto freedom; do { ret = iavf_clean_arq_element(hw, &event, &pending); v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high); @@ -3800,11 +3811,12 @@ static int iavf_close(struct net_device *netdev) struct iavf_adapter *adapter = netdev_priv(netdev); int status; - if (adapter->state <= __IAVF_DOWN_PENDING) - return 0; + mutex_lock(&adapter->crit_lock); - while (!mutex_trylock(&adapter->crit_lock)) - usleep_range(500, 1000); + if (adapter->state <= __IAVF_DOWN_PENDING) { + mutex_unlock(&adapter->crit_lock); + return 0; + } set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); if (CLIENT_ENABLED(adapter)) @@ -4431,7 +4443,6 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ mutex_init(&adapter->crit_lock); mutex_init(&adapter->client_lock); - mutex_init(&adapter->remove_lock); mutex_init(&hw->aq.asq_mutex); mutex_init(&hw->aq.arq_mutex); @@ -4556,11 +4567,7 @@ static void iavf_remove(struct pci_dev *pdev) struct iavf_cloud_filter *cf, *cftmp; struct iavf_hw *hw = &adapter->hw; int err; - /* Indicate we are in remove and not to run reset_task */ - mutex_lock(&adapter->remove_lock); - cancel_work_sync(&adapter->reset_task); - cancel_delayed_work_sync(&adapter->watchdog_task); - cancel_delayed_work_sync(&adapter->client_task); + if (adapter->netdev_registered) { unregister_netdev(netdev); adapter->netdev_registered = false; @@ -4572,6 +4579,10 @@ static void iavf_remove(struct pci_dev *pdev) err); } + mutex_lock(&adapter->crit_lock); + dev_info(&adapter->pdev->dev, "Remove device\n"); + iavf_change_state(adapter, __IAVF_REMOVE); + iavf_request_reset(adapter); msleep(50); /* If the FW isn't responding, kick it once, but only once. */ @@ -4579,18 +4590,19 @@ static void iavf_remove(struct pci_dev *pdev) iavf_request_reset(adapter); msleep(50); } - if (iavf_lock_timeout(&adapter->crit_lock, 5000)) - dev_warn(&adapter->pdev->dev, "failed to acquire crit_lock in %s\n", __FUNCTION__); - dev_info(&adapter->pdev->dev, "Removing device\n"); + iavf_misc_irq_disable(adapter); /* Shut down all the garbage mashers on the detention level */ - iavf_change_state(adapter, __IAVF_REMOVE); + cancel_work_sync(&adapter->reset_task); + cancel_delayed_work_sync(&adapter->watchdog_task); + cancel_work_sync(&adapter->adminq_task); + cancel_delayed_work_sync(&adapter->client_task); + adapter->aq_required = 0; adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; iavf_free_all_tx_resources(adapter); iavf_free_all_rx_resources(adapter); - iavf_misc_irq_disable(adapter); iavf_free_misc_irq(adapter); /* In case we enter iavf_remove from erroneous state, free traffic irqs @@ -4606,10 +4618,6 @@ static void iavf_remove(struct pci_dev *pdev) iavf_reset_interrupt_capability(adapter); iavf_free_q_vectors(adapter); - cancel_delayed_work_sync(&adapter->watchdog_task); - - cancel_work_sync(&adapter->adminq_task); - iavf_free_rss(adapter); if (hw->aq.asq.count) @@ -4621,8 +4629,6 @@ static void iavf_remove(struct pci_dev *pdev) mutex_destroy(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); mutex_destroy(&adapter->crit_lock); - mutex_unlock(&adapter->remove_lock); - mutex_destroy(&adapter->remove_lock); iounmap(hw->hw_addr); pci_release_regions(pdev); -- cgit From 974578017fc1fdd06cea8afb9dfa32602e8529ed Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:36:56 +0100 Subject: iavf: Add waiting so the port is initialized in remove There exist races when port is being configured and remove is triggered. unregister_netdev is not and can't be called under crit_lock mutex since it is calling ndo_stop -> iavf_close which requires this lock. Depending on init state the netdev could be still unregistered so unregister_netdev never cleans up, when shortly after that the device could become registered. Make iavf_remove wait until port finishes initialization. All critical state changes are atomic (under crit_lock). Crashes that come from iavf_reset_interrupt_capability and iavf_free_traffic_irqs should now be solved in a graceful manner. Fixes: 605ca7c5c6707 ("iavf: Fix kernel BUG in free_msi_irqs") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 84ae96e912d7..5e71b38e9154 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -4558,7 +4558,6 @@ static int __maybe_unused iavf_resume(struct device *dev_d) static void iavf_remove(struct pci_dev *pdev) { struct iavf_adapter *adapter = iavf_pdev_to_adapter(pdev); - enum iavf_state_t prev_state = adapter->last_state; struct net_device *netdev = adapter->netdev; struct iavf_fdir_fltr *fdir, *fdirtmp; struct iavf_vlan_filter *vlf, *vlftmp; @@ -4568,6 +4567,22 @@ static void iavf_remove(struct pci_dev *pdev) struct iavf_hw *hw = &adapter->hw; int err; + /* Wait until port initialization is complete. + * There are flows where register/unregister netdev may race. + */ + while (1) { + mutex_lock(&adapter->crit_lock); + if (adapter->state == __IAVF_RUNNING || + adapter->state == __IAVF_DOWN) { + mutex_unlock(&adapter->crit_lock); + break; + } + + mutex_unlock(&adapter->crit_lock); + usleep_range(500, 1000); + } + cancel_delayed_work_sync(&adapter->watchdog_task); + if (adapter->netdev_registered) { unregister_netdev(netdev); adapter->netdev_registered = false; @@ -4605,16 +4620,6 @@ static void iavf_remove(struct pci_dev *pdev) iavf_free_all_rx_resources(adapter); iavf_free_misc_irq(adapter); - /* In case we enter iavf_remove from erroneous state, free traffic irqs - * here, so as to not cause a kernel crash, when calling - * iavf_reset_interrupt_capability. - */ - if ((adapter->last_state == __IAVF_RESETTING && - prev_state != __IAVF_DOWN) || - (adapter->last_state == __IAVF_RUNNING && - !(netdev->flags & IFF_UP))) - iavf_free_traffic_irqs(adapter); - iavf_reset_interrupt_capability(adapter); iavf_free_q_vectors(adapter); -- cgit From 3ccd54ef44ebfa0792c5441b6d9c86618f3378d1 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:37:10 +0100 Subject: iavf: Fix init state closure on remove When init states of the adapter work, the errors like lack of communication with the PF might hop in. If such events occur the driver restores previous states in order to retry initialization in a proper way. When remove task kicks in, this situation could lead to races with unregistering the netdevice as well as resources cleanup. With the commit introducing the waiting in remove for init to complete, this problem turns into an endless waiting if init never recovers from errors. Introduce __IAVF_IN_REMOVE_TASK bit to indicate that the remove thread has started. Make __IAVF_COMM_FAILED adapter state respect the __IAVF_IN_REMOVE_TASK bit and set the __IAVF_INIT_FAILED state and return without any action instead of trying to recover. Make __IAVF_INIT_FAILED adapter state respect the __IAVF_IN_REMOVE_TASK bit and return without any further actions. Make the loop in the remove handler break when adapter has __IAVF_INIT_FAILED state set. Fixes: 898ef1cb1cb2 ("iavf: Combine init and watchdog state machines") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 4 ++++ drivers/net/ethernet/intel/iavf/iavf_main.c | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 44f83e06486d..f259fd517b2c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -201,6 +201,10 @@ enum iavf_state_t { __IAVF_RUNNING, /* opened, working */ }; +enum iavf_critical_section_t { + __IAVF_IN_REMOVE_TASK, /* device being removed */ +}; + #define IAVF_CLOUD_FIELD_OMAC 0x01 #define IAVF_CLOUD_FIELD_IMAC 0x02 #define IAVF_CLOUD_FIELD_IVLAN 0x04 diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 5e71b38e9154..be51da978e7c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2424,6 +2424,15 @@ static void iavf_watchdog_task(struct work_struct *work) msecs_to_jiffies(1)); return; case __IAVF_INIT_FAILED: + if (test_bit(__IAVF_IN_REMOVE_TASK, + &adapter->crit_section)) { + /* Do not update the state and do not reschedule + * watchdog task, iavf_remove should handle this state + * as it can loop forever + */ + mutex_unlock(&adapter->crit_lock); + return; + } if (++adapter->aq_wait_count > IAVF_AQ_MAX_ERR) { dev_err(&adapter->pdev->dev, "Failed to communicate with PF; waiting before retry\n"); @@ -2440,6 +2449,17 @@ static void iavf_watchdog_task(struct work_struct *work) queue_delayed_work(iavf_wq, &adapter->watchdog_task, HZ); return; case __IAVF_COMM_FAILED: + if (test_bit(__IAVF_IN_REMOVE_TASK, + &adapter->crit_section)) { + /* Set state to __IAVF_INIT_FAILED and perform remove + * steps. Remove IAVF_FLAG_PF_COMMS_FAILED so the task + * doesn't bring the state back to __IAVF_COMM_FAILED. + */ + iavf_change_state(adapter, __IAVF_INIT_FAILED); + adapter->flags &= ~IAVF_FLAG_PF_COMMS_FAILED; + mutex_unlock(&adapter->crit_lock); + return; + } reg_val = rd32(hw, IAVF_VFGEN_RSTAT) & IAVF_VFGEN_RSTAT_VFR_STATE_MASK; if (reg_val == VIRTCHNL_VFR_VFACTIVE || @@ -4567,13 +4587,15 @@ static void iavf_remove(struct pci_dev *pdev) struct iavf_hw *hw = &adapter->hw; int err; + set_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section); /* Wait until port initialization is complete. * There are flows where register/unregister netdev may race. */ while (1) { mutex_lock(&adapter->crit_lock); if (adapter->state == __IAVF_RUNNING || - adapter->state == __IAVF_DOWN) { + adapter->state == __IAVF_DOWN || + adapter->state == __IAVF_INIT_FAILED) { mutex_unlock(&adapter->crit_lock); break; } -- cgit From 0579fafd37fb7efe091f0e6c8ccf968864f40f3e Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:37:50 +0100 Subject: iavf: Fix locking for VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS iavf_virtchnl_completion is called under crit_lock but when the code for VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS is called, this lock is released in order to obtain rtnl_lock to avoid ABBA deadlock with unregister_netdev. Along with the new way iavf_remove behaves, there exist many risks related to the lock release and attmepts to regrab it. The driver faces crashes related to races between unregister_netdev and netdev_update_features. Yet another risk is that the driver could already obtain the crit_lock in order to destroy it and iavf_virtchnl_completion could crash or block forever. Make iavf_virtchnl_completion never relock crit_lock in it's call paths. Extract rtnl_lock locking logic to the driver for unregister_netdev in order to set the netdev_registered flag inside the lock. Introduce a new flag that will inform adminq_task to perform the code from VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS right after it finishes processing messages. Guard this code with remove flags so it's never called when the driver is in remove state. Fixes: 5951a2b9812d ("iavf: Fix VLAN feature flags after VFR") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 22 +++++++++++++++++++++- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 24 +----------------------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index f259fd517b2c..89423947ee65 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -287,6 +287,7 @@ struct iavf_adapter { #define IAVF_FLAG_LEGACY_RX BIT(15) #define IAVF_FLAG_REINIT_ITR_NEEDED BIT(16) #define IAVF_FLAG_QUEUES_DISABLED BIT(17) +#define IAVF_FLAG_SETUP_NETDEV_FEATURES BIT(18) /* duplicates for common code */ #define IAVF_FLAG_DCB_ENABLED 0 /* flags for admin queue service task */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index be51da978e7c..67349d24dc90 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2879,6 +2879,24 @@ static void iavf_adminq_task(struct work_struct *work) } while (pending); mutex_unlock(&adapter->crit_lock); + if ((adapter->flags & IAVF_FLAG_SETUP_NETDEV_FEATURES)) { + if (adapter->netdev_registered || + !test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) { + struct net_device *netdev = adapter->netdev; + + rtnl_lock(); + netdev_update_features(netdev); + rtnl_unlock(); + /* Request VLAN offload settings */ + if (VLAN_V2_ALLOWED(adapter)) + iavf_set_vlan_offload_features + (adapter, 0, netdev->features); + + iavf_set_queue_vlan_tag_loc(adapter); + } + + adapter->flags &= ~IAVF_FLAG_SETUP_NETDEV_FEATURES; + } if ((adapter->flags & (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED)) || adapter->state == __IAVF_RESETTING) @@ -4606,8 +4624,10 @@ static void iavf_remove(struct pci_dev *pdev) cancel_delayed_work_sync(&adapter->watchdog_task); if (adapter->netdev_registered) { - unregister_netdev(netdev); + rtnl_lock(); + unregister_netdevice(netdev); adapter->netdev_registered = false; + rtnl_unlock(); } if (CLIENT_ALLOWED(adapter)) { err = iavf_lan_del_device(adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 5ee1d118fd30..88844d68e150 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -2146,29 +2146,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, sizeof(adapter->vlan_v2_caps))); iavf_process_config(adapter); - - /* unlock crit_lock before acquiring rtnl_lock as other - * processes holding rtnl_lock could be waiting for the same - * crit_lock - */ - mutex_unlock(&adapter->crit_lock); - /* VLAN capabilities can change during VFR, so make sure to - * update the netdev features with the new capabilities - */ - rtnl_lock(); - netdev_update_features(netdev); - rtnl_unlock(); - if (iavf_lock_timeout(&adapter->crit_lock, 10000)) - dev_warn(&adapter->pdev->dev, "failed to acquire crit_lock in %s\n", - __FUNCTION__); - - /* Request VLAN offload settings */ - if (VLAN_V2_ALLOWED(adapter)) - iavf_set_vlan_offload_features(adapter, 0, - netdev->features); - - iavf_set_queue_vlan_tag_loc(adapter); - + adapter->flags |= IAVF_FLAG_SETUP_NETDEV_FEATURES; } break; case VIRTCHNL_OP_ENABLE_QUEUES: -- cgit From a472eb5cbaebb5774672c565e024336c039e9128 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:01 +0100 Subject: iavf: Fix race in init state When iavf_init_version_check sends VIRTCHNL_OP_GET_VF_RESOURCES message, the driver will wait for the response after requeueing the watchdog task in iavf_init_get_resources call stack. The logic is implemented this way that iavf_init_get_resources has to be called in order to allocate adapter->vf_res. It is polling for the AQ response in iavf_get_vf_config function. Expect a call trace from kernel when adminq_task worker handles this message first. adapter->vf_res will be NULL in iavf_virtchnl_completion. Make the watchdog task not queue the adminq_task if the init process is not finished yet. Fixes: 898ef1cb1cb2 ("iavf: Combine init and watchdog state machines") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 67349d24dc90..36433d6504b7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2532,7 +2532,8 @@ static void iavf_watchdog_task(struct work_struct *work) schedule_delayed_work(&adapter->client_task, msecs_to_jiffies(5)); mutex_unlock(&adapter->crit_lock); restart_watchdog: - queue_work(iavf_wq, &adapter->adminq_task); + if (adapter->state >= __IAVF_DOWN) + queue_work(iavf_wq, &adapter->adminq_task); if (adapter->aq_required) queue_delayed_work(iavf_wq, &adapter->watchdog_task, msecs_to_jiffies(20)); -- cgit From e85ff9c631e1bf109ce8428848dfc8e8b0041f48 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:31 +0100 Subject: iavf: Fix deadlock in iavf_reset_task There exists a missing mutex_unlock call on crit_lock in iavf_reset_task call path. Unlock the crit_lock before returning from reset task. Fixes: 5ac49f3c2702 ("iavf: use mutexes for locking of critical sections") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 36433d6504b7..da50ea3e44c2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2688,6 +2688,7 @@ static void iavf_reset_task(struct work_struct *work) reg_val); iavf_disable_vf(adapter); mutex_unlock(&adapter->client_lock); + mutex_unlock(&adapter->crit_lock); return; /* Do not attempt to reinit. It's dead, Jim. */ } -- cgit From d2c0f45fcceb0995f208c441d9c9a453623f9ccf Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:43 +0100 Subject: iavf: Fix missing check for running netdev The driver was queueing reset_task regardless of the netdev state. Do not queue the reset task in iavf_change_mtu if netdev is not running. Fixes: fdd4044ffdc8 ("iavf: Remove timer for work triggering, use delaying work instead") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index da50ea3e44c2..be97ac251802 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3905,8 +3905,11 @@ static int iavf_change_mtu(struct net_device *netdev, int new_mtu) iavf_notify_client_l2_params(&adapter->vsi); adapter->flags |= IAVF_FLAG_SERVICE_CLIENT_REQUESTED; } - adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(iavf_wq, &adapter->reset_task); + + if (netif_running(netdev)) { + adapter->flags |= IAVF_FLAG_RESET_NEEDED; + queue_work(iavf_wq, &adapter->reset_task); + } return 0; } -- cgit From 14756b2ae265d526b8356e86729090b01778fdf6 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:55 +0100 Subject: iavf: Fix __IAVF_RESETTING state usage The setup of __IAVF_RESETTING state in watchdog task had no effect and could lead to slow resets in the driver as the task for __IAVF_RESETTING state only requeues watchdog. Till now the __IAVF_RESETTING was interpreted by reset task as running state which could lead to errors with allocating and resources disposal. Make watchdog_task queue the reset task when it's necessary. Do not update the state to __IAVF_RESETTING so the reset task knows exactly what is the current state of the adapter. Fixes: 898ef1cb1cb2 ("iavf: Combine init and watchdog state machines") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index be97ac251802..dcf24264c7ea 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1137,8 +1137,7 @@ void iavf_down(struct iavf_adapter *adapter) rss->state = IAVF_ADV_RSS_DEL_REQUEST; spin_unlock_bh(&adapter->adv_rss_lock); - if (!(adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) && - adapter->state != __IAVF_RESETTING) { + if (!(adapter->flags & IAVF_FLAG_PF_COMMS_FAILED)) { /* cancel any current operation */ adapter->current_op = VIRTCHNL_OP_UNKNOWN; /* Schedule operations to close down the HW. Don't wait @@ -2385,11 +2384,12 @@ static void iavf_watchdog_task(struct work_struct *work) if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) iavf_change_state(adapter, __IAVF_COMM_FAILED); - if (adapter->flags & IAVF_FLAG_RESET_NEEDED && - adapter->state != __IAVF_RESETTING) { - iavf_change_state(adapter, __IAVF_RESETTING); + if (adapter->flags & IAVF_FLAG_RESET_NEEDED) { adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; + mutex_unlock(&adapter->crit_lock); + queue_work(iavf_wq, &adapter->reset_task); + return; } switch (adapter->state) { @@ -2697,8 +2697,7 @@ continue_reset: * ndo_open() returning, so we can't assume it means all our open * tasks have finished, since we're not holding the rtnl_lock here. */ - running = ((adapter->state == __IAVF_RUNNING) || - (adapter->state == __IAVF_RESETTING)); + running = adapter->state == __IAVF_RUNNING; if (running) { netdev->flags &= ~IFF_UP; -- cgit From ba115adf61b36b8c167126425a62b0efc23f72c0 Mon Sep 17 00:00:00 2001 From: David Gow Date: Sun, 27 Feb 2022 21:00:10 -0800 Subject: Input: samsung-keypad - properly state IOMEM dependency Make the samsung-keypad driver explicitly depend on CONFIG_HAS_IOMEM, as it calls devm_ioremap(). This prevents compile errors in some configs (e.g, allyesconfig/randconfig under UML): /usr/bin/ld: drivers/input/keyboard/samsung-keypad.o: in function `samsung_keypad_probe': samsung-keypad.c:(.text+0xc60): undefined reference to `devm_ioremap' Signed-off-by: David Gow Acked-by: anton ivanov Link: https://lore.kernel.org/r/20220225041727.1902850-1-davidgow@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 0c607da9ee10..9417ee0b1eff 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -556,7 +556,7 @@ config KEYBOARD_PMIC8XXX config KEYBOARD_SAMSUNG tristate "Samsung keypad support" - depends on HAVE_CLK + depends on HAS_IOMEM && HAVE_CLK select INPUT_MATRIXKMAP help Say Y here if you want to use the keypad on your Samsung mobile -- cgit From dcf0c838854c86e1f41fb1934aea906845d69782 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Fri, 28 Jan 2022 10:20:04 +0530 Subject: riscv/efi_stub: Fix get_boot_hartid_from_fdt() return value The get_boot_hartid_from_fdt() function currently returns U32_MAX for failure case which is not correct because U32_MAX is a valid hartid value. This patch fixes the issue by returning error code. Cc: Fixes: d7071743db31 ("RISC-V: Add EFI stub support.") Signed-off-by: Sunil V L Reviewed-by: Heinrich Schuchardt Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/riscv-stub.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/libstub/riscv-stub.c b/drivers/firmware/efi/libstub/riscv-stub.c index 380e4e251399..9c460843442f 100644 --- a/drivers/firmware/efi/libstub/riscv-stub.c +++ b/drivers/firmware/efi/libstub/riscv-stub.c @@ -25,7 +25,7 @@ typedef void __noreturn (*jump_kernel_func)(unsigned int, unsigned long); static u32 hartid; -static u32 get_boot_hartid_from_fdt(void) +static int get_boot_hartid_from_fdt(void) { const void *fdt; int chosen_node, len; @@ -33,23 +33,26 @@ static u32 get_boot_hartid_from_fdt(void) fdt = get_efi_config_table(DEVICE_TREE_GUID); if (!fdt) - return U32_MAX; + return -EINVAL; chosen_node = fdt_path_offset(fdt, "/chosen"); if (chosen_node < 0) - return U32_MAX; + return -EINVAL; prop = fdt_getprop((void *)fdt, chosen_node, "boot-hartid", &len); if (!prop || len != sizeof(u32)) - return U32_MAX; + return -EINVAL; - return fdt32_to_cpu(*prop); + hartid = fdt32_to_cpu(*prop); + return 0; } efi_status_t check_platform_features(void) { - hartid = get_boot_hartid_from_fdt(); - if (hartid == U32_MAX) { + int ret; + + ret = get_boot_hartid_from_fdt(); + if (ret) { efi_err("/chosen/boot-hartid missing or invalid!\n"); return EFI_UNSUPPORTED; } -- cgit From 258dd902022cb10c83671176688074879517fd21 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 18 Feb 2022 19:05:59 +0100 Subject: efivars: Respect "block" flag in efivar_entry_set_safe() When the "block" flag is false, the old code would sometimes still call check_var_size(), which wrongly tells ->query_variable_store() that it can block. As far as I can tell, this can't really materialize as a bug at the moment, because ->query_variable_store only does something on X86 with generic EFI, and in that configuration we always take the efivar_entry_set_nonblocking() path. Fixes: ca0e30dcaa53 ("efi: Add nonblocking option to efi_query_variable_store()") Signed-off-by: Jann Horn Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220218180559.1432559-1-jannh@google.com --- drivers/firmware/efi/vars.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index abdc8a6a3963..cae590bd08f2 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -742,6 +742,7 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, { const struct efivar_operations *ops; efi_status_t status; + unsigned long varsize; if (!__efivars) return -EINVAL; @@ -764,15 +765,17 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, return efivar_entry_set_nonblocking(name, vendor, attributes, size, data); + varsize = size + ucs2_strsize(name, 1024); if (!block) { if (down_trylock(&efivars_lock)) return -EBUSY; + status = check_var_size_nonblocking(attributes, varsize); } else { if (down_interruptible(&efivars_lock)) return -EINTR; + status = check_var_size(attributes, varsize); } - status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); if (status != EFI_SUCCESS) { up(&efivars_lock); return -ENOSPC; -- cgit From 9995b408f17ff8c7f11bc725c8aa225ba3a63b1c Mon Sep 17 00:00:00 2001 From: "j.nixdorf@avm.de" Date: Thu, 24 Feb 2022 10:06:49 +0100 Subject: net: ipv6: ensure we call ipv6_mc_down() at most once There are two reasons for addrconf_notify() to be called with NETDEV_DOWN: either the network device is actually going down, or IPv6 was disabled on the interface. If either of them stays down while the other is toggled, we repeatedly call the code for NETDEV_DOWN, including ipv6_mc_down(), while never calling the corresponding ipv6_mc_up() in between. This will cause a new entry in idev->mc_tomb to be allocated for each multicast group the interface is subscribed to, which in turn leaks one struct ifmcaddr6 per nontrivial multicast group the interface is subscribed to. The following reproducer will leak at least $n objects: ip addr add ff2e::4242/32 dev eth0 autojoin sysctl -w net.ipv6.conf.eth0.disable_ipv6=1 for i in $(seq 1 $n); do ip link set up eth0; ip link set down eth0 done Joining groups with IPV6_ADD_MEMBERSHIP (unprivileged) or setting the sysctl net.ipv6.conf.eth0.forwarding to 1 (=> subscribing to ff02::2) can also be used to create a nontrivial idev->mc_list, which will the leak objects with the right up-down-sequence. Based on both sources for NETDEV_DOWN events the interface IPv6 state should be considered: - not ready if the network interface is not ready OR IPv6 is disabled for it - ready if the network interface is ready AND IPv6 is enabled for it The functions ipv6_mc_up() and ipv6_down() should only be run when this state changes. Implement this by remembering when the IPv6 state is ready, and only run ipv6_mc_down() if it actually changed from ready to not ready. The other direction (not ready -> ready) already works correctly, as: - the interface notification triggered codepath for NETDEV_UP / NETDEV_CHANGE returns early if ipv6 is disabled, and - the disable_ipv6=0 triggered codepath skips fully initializing the interface as long as addrconf_link_ready(dev) returns false - calling ipv6_mc_up() repeatedly does not leak anything Fixes: 3ce62a84d53c ("ipv6: exit early in addrconf_notify() if IPv6 is disabled") Signed-off-by: Johannes Nixdorf Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c8ab3e6e6fe..f908e2fd30b2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3732,6 +3732,7 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; bool keep_addr = false; + bool was_ready; int state, i; ASSERT_RTNL(); @@ -3797,7 +3798,10 @@ restart: addrconf_del_rs_timer(idev); - /* Step 2: clear flags for stateless addrconf */ + /* Step 2: clear flags for stateless addrconf, repeated down + * detection + */ + was_ready = idev->if_flags & IF_READY; if (!unregister) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); @@ -3871,7 +3875,7 @@ restart: if (unregister) { ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - } else { + } else if (was_ready) { ipv6_mc_down(idev); } -- cgit From 32568ae37596b529628ac09b875f4874e614f63f Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. Prado" Date: Mon, 14 Feb 2022 15:05:07 -0500 Subject: arm64: dts: mt8183: jacuzzi: Fix bus properties in anx's DSI endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mt8183-kukui-jacuzzi has an anx7625 bridge connected to the output of its DSI host. However, after commit fd0310b6fe7d ("drm/bridge: anx7625: add MIPI DPI input feature"), a bus-type property started being required in the endpoint node by the driver to indicate whether it is DSI or DPI. Add the missing bus-type property and set it to 5 (V4L2_FWNODE_BUS_TYPE_PARALLEL) so that the driver has its input configured to DSI and the display pipeline can probe correctly. While at it, also set the data-lanes property that was also introduced in that same commit, so that we don't rely on the default value. Fixes: fd0310b6fe7d ("drm/bridge: anx7625: add MIPI DPI input feature") Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/r/20220214200507.2500693-1-nfraprado@collabora.com Signed-off-by: Matthias Brugger --- arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi index 8f7bf33f607d..e8f133dc96b9 100644 --- a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi @@ -171,6 +171,8 @@ anx7625_in: endpoint { remote-endpoint = <&dsi_out>; + bus-type = <5>; + data-lanes = <0 1 2 3>; }; }; -- cgit From 4d08b7b57ece83a1c31c633a7e4e27f121157f9c Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 25 Feb 2022 14:56:57 +0800 Subject: net/smc: Fix cleanup when register ULP fails This patch calls smc_ib_unregister_client() when tcp_register_ulp() fails, and make sure to clean it up. Fixes: d7cd421da9da ("net/smc: Introduce TCP ULP support") Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 81984c1c0e78..284befa90967 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3087,12 +3087,14 @@ static int __init smc_init(void) rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_sock; + goto out_ib; } static_branch_enable(&tcp_have_smc); return 0; +out_ib: + smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: -- cgit From 90d4025285748448809701a44cf466a3f5443eaa Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Fri, 25 Feb 2022 13:43:27 +0100 Subject: net: sparx5: Add #include to remove warning main.h uses NUM_TARGETS from main_regs.h, but the missing include never causes any errors because everywhere main.h is (currently) included, main_regs.h is included before. But since it is dependent on main_regs.h it should always be included. Signed-off-by: Casper Andersson Reviewed-by: Joacim Zetterling Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/sparx5/sparx5_main.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index a1acc9b461f2..d40e18ce3293 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -16,6 +16,8 @@ #include #include +#include "sparx5_main_regs.h" + /* Target chip type */ enum spx5_target_chiptype { SPX5_TARGET_CT_7546 = 0x7546, /* SparX-5-64 Enterprise */ -- cgit From d4e26aaea7f82ba884dcb4acfe689406bc092dc3 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 25 Feb 2022 04:52:30 -0800 Subject: atm: firestream: check the return value of ioremap() in fs_init() The function ioremap() in fs_init() can fail, so its return value should be checked. Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/atm/firestream.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 3bc3c314a467..4f67404fe64c 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -1676,6 +1676,8 @@ static int fs_init(struct fs_dev *dev) dev->hw_base = pci_resource_start(pci_dev, 0); dev->base = ioremap(dev->hw_base, 0x1000); + if (!dev->base) + return 1; reset_chip (dev); -- cgit From caef14b7530c065fb85d54492768fa48fdb5093e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 25 Feb 2022 14:15:30 -0600 Subject: net: ipa: fix a build dependency An IPA build problem arose in the linux-next tree the other day. The problem is that a recent commit adds a new dependency on some code, and the Kconfig file for IPA doesn't reflect that dependency. As a result, some configurations can fail to build (particularly when COMPILE_TEST is enabled). The recent patch adds calls to qmp_get(), qmp_put(), and qmp_send(), and those are built based on the QCOM_AOSS_QMP config option. If that symbol is not defined, stubs are defined, so we just need to ensure QCOM_AOSS_QMP is compatible with QCOM_IPA, or it's not defined. Reported-by: Randy Dunlap Fixes: 34a081761e4e3 ("net: ipa: request IPA register values be retained") Signed-off-by: Alex Elder Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- drivers/net/ipa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/Kconfig b/drivers/net/ipa/Kconfig index d037682fb7ad..e0164a55c1e6 100644 --- a/drivers/net/ipa/Kconfig +++ b/drivers/net/ipa/Kconfig @@ -3,6 +3,7 @@ config QCOM_IPA depends on NET && QCOM_SMEM depends on ARCH_QCOM || COMPILE_TEST depends on QCOM_RPROC_COMMON || (QCOM_RPROC_COMMON=n && COMPILE_TEST) + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select QCOM_MDT_LOADER if ARCH_QCOM select QCOM_SCM select QCOM_QMI_HELPERS -- cgit From 1b279f6ad467535c3b8a66b4edefaca2cdd5bdc3 Mon Sep 17 00:00:00 2001 From: Vinay Belgaumkar Date: Wed, 16 Feb 2022 10:15:04 -0800 Subject: drm/i915/guc/slpc: Correct the param count for unset param SLPC unset param H2G only needs one parameter - the id of the param. Fixes: 025cb07bebfa ("drm/i915/guc/slpc: Cache platform frequency limits") Suggested-by: Umesh Nerlige Ramappa Signed-off-by: Vinay Belgaumkar Reviewed-by: Umesh Nerlige Ramappa Signed-off-by: Ramalingam C Link: https://patchwork.freedesktop.org/patch/msgid/20220216181504.7155-1-vinay.belgaumkar@intel.com (cherry picked from commit 9648f1c3739505557d94ff749a4f32192ea81fe3) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c index 13b27b8ff74e..ba21ace973da 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c @@ -110,7 +110,7 @@ static int guc_action_slpc_unset_param(struct intel_guc *guc, u8 id) { u32 request[] = { GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST, - SLPC_EVENT(SLPC_EVENT_PARAMETER_UNSET, 2), + SLPC_EVENT(SLPC_EVENT_PARAMETER_UNSET, 1), id, }; -- cgit From 08783aa7693f55619859f4f63f384abf17cb58c5 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 24 Feb 2022 15:21:42 +0200 Subject: drm/i915: s/JSP2/ICP2/ PCH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This JSP2 PCH actually seems to be some special Apple specific ICP variant rather than a JSP. Make it so. Or at least all the references to it seem to be some Apple ICL machines. Didn't manage to find these PCI IDs in any public chipset docs unfortunately. The only thing we're losing here with this JSP->ICP change is Wa_14011294188, but based on the HSD that isn't actually needed on any ICP based design (including JSP), only TGP based stuff (including MCC) really need it. The documented w/a just never made that distinction because Windows didn't want to differentiate between JSP and MCC (not sure how they handle hpd/ddc/etc. then though...). Cc: stable@vger.kernel.org Cc: Matt Roper Cc: Vivek Kasireddy Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4226 Fixes: 943682e3bd19 ("drm/i915: Introduce Jasper Lake PCH") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220224132142.12927-1-ville.syrjala@linux.intel.com Acked-by: Vivek Kasireddy Tested-by: Tomas Bzatek (cherry picked from commit 53581504a8e216d435f114a4f2596ad0dfd902fc) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_pch.c | 2 +- drivers/gpu/drm/i915/intel_pch.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pch.c b/drivers/gpu/drm/i915/intel_pch.c index da8f82c2342f..fc8a68f3a2ed 100644 --- a/drivers/gpu/drm/i915/intel_pch.c +++ b/drivers/gpu/drm/i915/intel_pch.c @@ -108,6 +108,7 @@ intel_pch_type(const struct drm_i915_private *dev_priv, unsigned short id) /* Comet Lake V PCH is based on KBP, which is SPT compatible */ return PCH_SPT; case INTEL_PCH_ICP_DEVICE_ID_TYPE: + case INTEL_PCH_ICP2_DEVICE_ID_TYPE: drm_dbg_kms(&dev_priv->drm, "Found Ice Lake PCH\n"); drm_WARN_ON(&dev_priv->drm, !IS_ICELAKE(dev_priv)); return PCH_ICP; @@ -123,7 +124,6 @@ intel_pch_type(const struct drm_i915_private *dev_priv, unsigned short id) !IS_GEN9_BC(dev_priv)); return PCH_TGP; case INTEL_PCH_JSP_DEVICE_ID_TYPE: - case INTEL_PCH_JSP2_DEVICE_ID_TYPE: drm_dbg_kms(&dev_priv->drm, "Found Jasper Lake PCH\n"); drm_WARN_ON(&dev_priv->drm, !IS_JSL_EHL(dev_priv)); return PCH_JSP; diff --git a/drivers/gpu/drm/i915/intel_pch.h b/drivers/gpu/drm/i915/intel_pch.h index 6bff77521094..4ba0f1967cca 100644 --- a/drivers/gpu/drm/i915/intel_pch.h +++ b/drivers/gpu/drm/i915/intel_pch.h @@ -50,11 +50,11 @@ enum intel_pch { #define INTEL_PCH_CMP2_DEVICE_ID_TYPE 0x0680 #define INTEL_PCH_CMP_V_DEVICE_ID_TYPE 0xA380 #define INTEL_PCH_ICP_DEVICE_ID_TYPE 0x3480 +#define INTEL_PCH_ICP2_DEVICE_ID_TYPE 0x3880 #define INTEL_PCH_MCC_DEVICE_ID_TYPE 0x4B00 #define INTEL_PCH_TGP_DEVICE_ID_TYPE 0xA080 #define INTEL_PCH_TGP2_DEVICE_ID_TYPE 0x4380 #define INTEL_PCH_JSP_DEVICE_ID_TYPE 0x4D80 -#define INTEL_PCH_JSP2_DEVICE_ID_TYPE 0x3880 #define INTEL_PCH_ADP_DEVICE_ID_TYPE 0x7A80 #define INTEL_PCH_ADP2_DEVICE_ID_TYPE 0x5180 #define INTEL_PCH_ADP3_DEVICE_ID_TYPE 0x7A00 -- cgit From f0d2f15362f02444c5d7ffd5a5eb03e4aa54b685 Mon Sep 17 00:00:00 2001 From: Rong Chen Date: Wed, 16 Feb 2022 20:42:39 +0800 Subject: mmc: meson: Fix usage of meson_mmc_post_req() Currently meson_mmc_post_req() is called in meson_mmc_request() right after meson_mmc_start_cmd(). This could lead to DMA unmapping before the request is actually finished. To fix, don't call meson_mmc_post_req() until meson_mmc_request_done(). Signed-off-by: Rong Chen Reviewed-by: Kevin Hilman Fixes: 79ed05e329c3 ("mmc: meson-gx: add support for descriptor chain mode") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220216124239.4007667-1-rong.chen@amlogic.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 8f36536cb1b6..58ab9d90bc8b 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -173,6 +173,8 @@ struct meson_host { int irq; bool vqmmc_enabled; + bool needs_pre_post_req; + }; #define CMD_CFG_LENGTH_MASK GENMASK(8, 0) @@ -663,6 +665,8 @@ static void meson_mmc_request_done(struct mmc_host *mmc, struct meson_host *host = mmc_priv(mmc); host->cmd = NULL; + if (host->needs_pre_post_req) + meson_mmc_post_req(mmc, mrq, 0); mmc_request_done(host->mmc, mrq); } @@ -880,7 +884,7 @@ static int meson_mmc_validate_dram_access(struct mmc_host *mmc, struct mmc_data static void meson_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) { struct meson_host *host = mmc_priv(mmc); - bool needs_pre_post_req = mrq->data && + host->needs_pre_post_req = mrq->data && !(mrq->data->host_cookie & SD_EMMC_PRE_REQ_DONE); /* @@ -896,22 +900,19 @@ static void meson_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) } } - if (needs_pre_post_req) { + if (host->needs_pre_post_req) { meson_mmc_get_transfer_mode(mmc, mrq); if (!meson_mmc_desc_chain_mode(mrq->data)) - needs_pre_post_req = false; + host->needs_pre_post_req = false; } - if (needs_pre_post_req) + if (host->needs_pre_post_req) meson_mmc_pre_req(mmc, mrq); /* Stop execution */ writel(0, host->regs + SD_EMMC_START); meson_mmc_start_cmd(mmc, mrq->sbc ?: mrq->cmd); - - if (needs_pre_post_req) - meson_mmc_post_req(mmc, mrq, 0); } static void meson_mmc_read_resp(struct mmc_host *mmc, struct mmc_command *cmd) -- cgit From b00833768e170a31af09268f7ab96aecfcca9623 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Mon, 21 Feb 2022 13:33:48 +0800 Subject: iommu/vt-d: Fix double list_add when enabling VMD in scalable mode When enabling VMD and IOMMU scalable mode, the following kernel panic call trace/kernel log is shown in Eagle Stream platform (Sapphire Rapids CPU) during booting: pci 0000:59:00.5: Adding to iommu group 42 ... vmd 0000:59:00.5: PCI host bridge to bus 10000:80 pci 10000:80:01.0: [8086:352a] type 01 class 0x060400 pci 10000:80:01.0: reg 0x10: [mem 0x00000000-0x0001ffff 64bit] pci 10000:80:01.0: enabling Extended Tags pci 10000:80:01.0: PME# supported from D0 D3hot D3cold pci 10000:80:01.0: DMAR: Setup RID2PASID failed pci 10000:80:01.0: Failed to add to iommu group 42: -16 pci 10000:80:03.0: [8086:352b] type 01 class 0x060400 pci 10000:80:03.0: reg 0x10: [mem 0x00000000-0x0001ffff 64bit] pci 10000:80:03.0: enabling Extended Tags pci 10000:80:03.0: PME# supported from D0 D3hot D3cold ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:29! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 7 Comm: kworker/0:1 Not tainted 5.17.0-rc3+ #7 Hardware name: Lenovo ThinkSystem SR650V3/SB27A86647, BIOS ESE101Y-1.00 01/13/2022 Workqueue: events work_for_cpu_fn RIP: 0010:__list_add_valid.cold+0x26/0x3f Code: 9a 4a ab ff 4c 89 c1 48 c7 c7 40 0c d9 9e e8 b9 b1 fe ff 0f 0b 48 89 f2 4c 89 c1 48 89 fe 48 c7 c7 f0 0c d9 9e e8 a2 b1 fe ff <0f> 0b 48 89 d1 4c 89 c6 4c 89 ca 48 c7 c7 98 0c d9 9e e8 8b b1 fe RSP: 0000:ff5ad434865b3a40 EFLAGS: 00010246 RAX: 0000000000000058 RBX: ff4d61160b74b880 RCX: ff4d61255e1fffa8 RDX: 0000000000000000 RSI: 00000000fffeffff RDI: ffffffff9fd34f20 RBP: ff4d611d8e245c00 R08: 0000000000000000 R09: ff5ad434865b3888 R10: ff5ad434865b3880 R11: ff4d61257fdc6fe8 R12: ff4d61160b74b8a0 R13: ff4d61160b74b8a0 R14: ff4d611d8e245c10 R15: ff4d611d8001ba70 FS: 0000000000000000(0000) GS:ff4d611d5ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ff4d611fa1401000 CR3: 0000000aa0210001 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: intel_pasid_alloc_table+0x9c/0x1d0 dmar_insert_one_dev_info+0x423/0x540 ? device_to_iommu+0x12d/0x2f0 intel_iommu_attach_device+0x116/0x290 __iommu_attach_device+0x1a/0x90 iommu_group_add_device+0x190/0x2c0 __iommu_probe_device+0x13e/0x250 iommu_probe_device+0x24/0x150 iommu_bus_notifier+0x69/0x90 blocking_notifier_call_chain+0x5a/0x80 device_add+0x3db/0x7b0 ? arch_memremap_can_ram_remap+0x19/0x50 ? memremap+0x75/0x140 pci_device_add+0x193/0x1d0 pci_scan_single_device+0xb9/0xf0 pci_scan_slot+0x4c/0x110 pci_scan_child_bus_extend+0x3a/0x290 vmd_enable_domain.constprop.0+0x63e/0x820 vmd_probe+0x163/0x190 local_pci_probe+0x42/0x80 work_for_cpu_fn+0x13/0x20 process_one_work+0x1e2/0x3b0 worker_thread+0x1c4/0x3a0 ? rescuer_thread+0x370/0x370 kthread+0xc7/0xf0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Modules linked in: ---[ end trace 0000000000000000 ]--- ... Kernel panic - not syncing: Fatal exception Kernel Offset: 0x1ca00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- The following 'lspci' output shows devices '10000:80:*' are subdevices of the VMD device 0000:59:00.5: $ lspci ... 0000:59:00.5 RAID bus controller: Intel Corporation Volume Management Device NVMe RAID Controller (rev 20) ... 10000:80:01.0 PCI bridge: Intel Corporation Device 352a (rev 03) 10000:80:03.0 PCI bridge: Intel Corporation Device 352b (rev 03) 10000:80:05.0 PCI bridge: Intel Corporation Device 352c (rev 03) 10000:80:07.0 PCI bridge: Intel Corporation Device 352d (rev 03) 10000:81:00.0 Non-Volatile memory controller: Intel Corporation NVMe Datacenter SSD [3DNAND, Beta Rock Controller] 10000:82:00.0 Non-Volatile memory controller: Intel Corporation NVMe Datacenter SSD [3DNAND, Beta Rock Controller] The symptom 'list_add double add' is caused by the following failure message: pci 10000:80:01.0: DMAR: Setup RID2PASID failed pci 10000:80:01.0: Failed to add to iommu group 42: -16 pci 10000:80:03.0: [8086:352b] type 01 class 0x060400 Device 10000:80:01.0 is the subdevice of the VMD device 0000:59:00.5, so invoking intel_pasid_alloc_table() gets the pasid_table of the VMD device 0000:59:00.5. Here is call path: intel_pasid_alloc_table pci_for_each_dma_alias get_alias_pasid_table search_pasid_table pci_real_dma_dev() in pci_for_each_dma_alias() gets the real dma device which is the VMD device 0000:59:00.5. However, pte of the VMD device 0000:59:00.5 has been configured during this message "pci 0000:59:00.5: Adding to iommu group 42". So, the status -EBUSY is returned when configuring pasid entry for device 10000:80:01.0. It then invokes dmar_remove_one_dev_info() to release 'struct device_domain_info *' from iommu_devinfo_cache. But, the pasid table is not released because of the following statement in __dmar_remove_one_dev_info(): if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { ... intel_pasid_free_table(info->dev); } The subsequent dmar_insert_one_dev_info() operation of device 10000:80:03.0 allocates 'struct device_domain_info *' from iommu_devinfo_cache. The allocated address is the same address that is released previously for device 10000:80:01.0. Finally, invoking device_attach_pasid_table() causes the issue. `git bisect` points to the offending commit 474dd1c65064 ("iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries"), which releases the pasid table if the device is not the subdevice by checking the returned status of dev_is_real_dma_subdevice(). Reverting the offending commit can work around the issue. The solution is to prevent from allocating pasid table if those devices are subdevices of the VMD device. Fixes: 474dd1c65064 ("iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Adrian Huang Link: https://lore.kernel.org/r/20220216091307.703-1-adrianhuang0701@gmail.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20220221053348.262724-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 92fea3fbbb11..5b196cfe9ed2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2738,7 +2738,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, spin_unlock_irqrestore(&device_domain_lock, flags); /* PASID table is mandatory for a PCI device in scalable mode. */ - if (dev && dev_is_pci(dev) && sm_supported(iommu)) { + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); if (ret) { dev_err(dev, "PASID table allocation failed\n"); -- cgit From 9826e393e4a8c3df474e7f9eacd3087266f74005 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 7 Jan 2022 08:09:11 +0000 Subject: iommu/tegra-smmu: Fix missing put_device() call in tegra_smmu_find The reference taken by 'of_find_device_by_node()' must be released when not needed anymore. Add the corresponding 'put_device()' in the error handling path. Fixes: 765a9d1d02b2 ("iommu/tegra-smmu: Fix mc errors on tegra124-nyan") Signed-off-by: Miaoqian Lin Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20220107080915.12686-1-linmq006@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/tegra-smmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index e900e3c46903..2561ce8a2ce8 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -808,8 +808,10 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np) return NULL; mc = platform_get_drvdata(pdev); - if (!mc) + if (!mc) { + put_device(&pdev->dev); return NULL; + } return mc->smmu; } -- cgit From 30939293262eb433c960c4532a0d59c4073b2b84 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 28 Feb 2022 11:43:54 +0800 Subject: blktrace: fix use after free for struct blk_trace When tracing the whole disk, 'dropped' and 'msg' will be created under 'q->debugfs_dir' and 'bt->dir' is NULL, thus blk_trace_free() won't remove those files. What's worse, the following UAF can be triggered because of accessing stale 'dropped' and 'msg': ================================================================== BUG: KASAN: use-after-free in blk_dropped_read+0x89/0x100 Read of size 4 at addr ffff88816912f3d8 by task blktrace/1188 CPU: 27 PID: 1188 Comm: blktrace Not tainted 5.17.0-rc4-next-20220217+ #469 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 Call Trace: dump_stack_lvl+0x34/0x44 print_address_description.constprop.0.cold+0xab/0x381 ? blk_dropped_read+0x89/0x100 ? blk_dropped_read+0x89/0x100 kasan_report.cold+0x83/0xdf ? blk_dropped_read+0x89/0x100 kasan_check_range+0x140/0x1b0 blk_dropped_read+0x89/0x100 ? blk_create_buf_file_callback+0x20/0x20 ? kmem_cache_free+0xa1/0x500 ? do_sys_openat2+0x258/0x460 full_proxy_read+0x8f/0xc0 vfs_read+0xc6/0x260 ksys_read+0xb9/0x150 ? vfs_write+0x3d0/0x3d0 ? fpregs_assert_state_consistent+0x55/0x60 ? exit_to_user_mode_prepare+0x39/0x1e0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fbc080d92fd Code: ce 20 00 00 75 10 b8 00 00 00 00 0f 05 48 3d 01 f0 ff ff 73 31 c3 48 83 1 RSP: 002b:00007fbb95ff9cb0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00007fbb95ff9dc0 RCX: 00007fbc080d92fd RDX: 0000000000000100 RSI: 00007fbb95ff9cc0 RDI: 0000000000000045 RBP: 0000000000000045 R08: 0000000000406299 R09: 00000000fffffffd R10: 000000000153afa0 R11: 0000000000000293 R12: 00007fbb780008c0 R13: 00007fbb78000938 R14: 0000000000608b30 R15: 00007fbb780029c8 Allocated by task 1050: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 do_blk_trace_setup+0xcb/0x410 __blk_trace_setup+0xac/0x130 blk_trace_ioctl+0xe9/0x1c0 blkdev_ioctl+0xf1/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 1050: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0x103/0x180 kfree+0x9a/0x4c0 __blk_trace_remove+0x53/0x70 blk_trace_ioctl+0x199/0x1c0 blkdev_common_ioctl+0x5e9/0xb30 blkdev_ioctl+0x1a5/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88816912f380 which belongs to the cache kmalloc-96 of size 96 The buggy address is located 88 bytes inside of 96-byte region [ffff88816912f380, ffff88816912f3e0) The buggy address belongs to the page: page:000000009a1b4e7c refcount:1 mapcount:0 mapping:0000000000000000 index:0x0f flags: 0x17ffffc0000200(slab|node=0|zone=2|lastcpupid=0x1fffff) raw: 0017ffffc0000200 ffffea00044f1100 dead000000000002 ffff88810004c780 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88816912f280: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f300: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc >ffff88816912f380: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ ffff88816912f400: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f480: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ================================================================== Fixes: c0ea57608b69 ("blktrace: remove debugfs file dentries from struct blk_trace") Signed-off-by: Yu Kuai Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220228034354.4047385-1-yukuai3@huawei.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index af68a67179b4..21dea90eaa93 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -310,10 +310,20 @@ record_it: local_irq_restore(flags); } -static void blk_trace_free(struct blk_trace *bt) +static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); - debugfs_remove(bt->dir); + + /* + * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created + * under 'q->debugfs_dir', thus lookup and remove them. + */ + if (!bt->dir) { + debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); + debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + } else { + debugfs_remove(bt->dir); + } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -335,10 +345,10 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); put_probe_ref(); } @@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q) return -EINVAL; if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(bt); + blk_trace_cleanup(q, bt); return 0; } @@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: if (ret) - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } @@ -1616,7 +1626,7 @@ static int blk_trace_remove_queue(struct request_queue *q) put_probe_ref(); synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); return 0; } @@ -1647,7 +1657,7 @@ static int blk_trace_setup_queue(struct request_queue *q, return 0; free_bt: - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } -- cgit From 7b83299e5b9385943a857d59e15cba270df20d7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Feb 2022 20:46:35 +0100 Subject: ARM: 9182/1: mmu: fix returns from early_param() and __setup() functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit early_param() handlers should return 0 on success. __setup() handlers should return 1 on success, i.e., the parameter has been handled. A return of 0 would cause the "option=value" string to be added to init's environment strings, polluting it. ../arch/arm/mm/mmu.c: In function 'test_early_cachepolicy': ../arch/arm/mm/mmu.c:215:1: error: no return statement in function returning non-void [-Werror=return-type] ../arch/arm/mm/mmu.c: In function 'test_noalign_setup': ../arch/arm/mm/mmu.c:221:1: error: no return statement in function returning non-void [-Werror=return-type] Fixes: b849a60e0903 ("ARM: make cr_alignment read-only #ifndef CONFIG_CPU_CP15") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Cc: Uwe Kleine-König Cc: linux-arm-kernel@lists.infradead.org Cc: patches@armlinux.org.uk Signed-off-by: Russell King (Oracle) --- arch/arm/mm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 274e4f73fd33..5e2be37a198e 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -212,12 +212,14 @@ early_param("ecc", early_ecc); static int __init early_cachepolicy(char *p) { pr_warn("cachepolicy kernel parameter not supported without cp15\n"); + return 0; } early_param("cachepolicy", early_cachepolicy); static int __init noalign_setup(char *__unused) { pr_warn("noalign kernel parameter not supported without cp15\n"); + return 1; } __setup("noalign", noalign_setup); -- cgit From fda2635466cd26ad237e1bc5d3f6a60f97ad09b6 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Wed, 16 Feb 2022 14:31:35 +0100 Subject: igc: igc_read_phy_reg_gpy: drop premature return igc_read_phy_reg_gpy checks the return value from igc_read_phy_reg_mdic and if it's not 0, returns immediately. By doing this, it leaves the HW semaphore in the acquired state. Drop this premature return statement, the function returns after releasing the semaphore immediately anyway. Fixes: 5586838fe9ce ("igc: Add code for PHY support") Signed-off-by: Corinna Vinschen Acked-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_phy.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index 5cad31c3c7b0..df91d07ce82a 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -779,8 +779,6 @@ s32 igc_read_phy_reg_gpy(struct igc_hw *hw, u32 offset, u16 *data) if (ret_val) return ret_val; ret_val = igc_read_phy_reg_mdic(hw, offset, data); - if (ret_val) - return ret_val; hw->phy.ops.release(hw); } else { ret_val = igc_read_xmdio_reg(hw, (u16)offset, dev_addr, -- cgit From c4208653a327a09da1e9e7b10299709b6d9b17bf Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Sun, 20 Feb 2022 09:29:15 +0200 Subject: igc: igc_write_phy_reg_gpy: drop premature return Similar to "igc_read_phy_reg_gpy: drop premature return" patch. igc_write_phy_reg_gpy checks the return value from igc_write_phy_reg_mdic and if it's not 0, returns immediately. By doing this, it leaves the HW semaphore in the acquired state. Drop this premature return statement, the function returns after releasing the semaphore immediately anyway. Fixes: 5586838fe9ce ("igc: Add code for PHY support") Suggested-by: Dima Ruinskiy Reported-by: Corinna Vinschen Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_phy.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index df91d07ce82a..40dbf4b43234 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -746,8 +746,6 @@ s32 igc_write_phy_reg_gpy(struct igc_hw *hw, u32 offset, u16 data) if (ret_val) return ret_val; ret_val = igc_write_phy_reg_mdic(hw, offset, data); - if (ret_val) - return ret_val; hw->phy.ops.release(hw); } else { ret_val = igc_write_xmdio_reg(hw, (u16)offset, dev_addr, -- cgit From 244d00b5dd4755f8df892c86cab35fb2cfd4f14b Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 28 Feb 2022 11:23:15 -0600 Subject: x86/speculation: Use generic retpoline by default on AMD AMD retpoline may be susceptible to speculation. The speculation execution window for an incorrect indirect branch prediction using LFENCE/JMP sequence may potentially be large enough to allow exploitation using Spectre V2. By default, don't use retpoline,lfence on AMD. Instead, use the generic retpoline. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/bugs.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0a4267c63d3b..a36bfe2c2480 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -941,15 +941,6 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_NONE; } - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("LFENCE not serializing, switching to generic retpoline\n"); - return SPECTRE_V2_RETPOLINE; - } - return SPECTRE_V2_LFENCE; - } - return SPECTRE_V2_RETPOLINE; } -- cgit From e9b6013a7ce31535b04b02ba99babefe8a8599fa Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 28 Feb 2022 11:23:16 -0600 Subject: x86/speculation: Update link to AMD speculation whitepaper Update the link to the "Software Techniques for Managing Speculation on AMD Processors" whitepaper. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov --- Documentation/admin-guide/hw-vuln/spectre.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 79051f4dac45..9e9556826450 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -60,8 +60,8 @@ privileged data touched during the speculative execution. Spectre variant 1 attacks take advantage of speculative execution of conditional branches, while Spectre variant 2 attacks use speculative execution of indirect branches to leak privileged memory. -See :ref:`[1] ` :ref:`[5] ` :ref:`[7] ` -:ref:`[10] ` :ref:`[11] `. +See :ref:`[1] ` :ref:`[5] ` :ref:`[6] ` +:ref:`[7] ` :ref:`[10] ` :ref:`[11] `. Spectre variant 1 (Bounds Check Bypass) --------------------------------------- @@ -697,7 +697,7 @@ AMD white papers: .. _spec_ref6: -[6] `Software techniques for managing speculation on AMD processors `_. +[6] `Software techniques for managing speculation on AMD processors `_. ARM white papers: -- cgit From 26d3474348293dc752c55fe6d41282199f73714c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 22 Feb 2022 14:18:43 -0800 Subject: drm/bridge: ti-sn65dsi86: Properly undo autosuspend The PM Runtime docs say: Drivers in ->remove() callback should undo the runtime PM changes done in ->probe(). Usually this means calling pm_runtime_disable(), pm_runtime_dont_use_autosuspend() etc. We weren't doing that for autosuspend. Let's do it. Fixes: 9bede63127c6 ("drm/bridge: ti-sn65dsi86: Use pm_runtime autosuspend") Signed-off-by: Douglas Anderson Reviewed-by: Linus Walleij Link: https://patchwork.freedesktop.org/patch/msgid/20220222141838.1.If784ba19e875e8ded4ec4931601ce6d255845245@changeid --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index dab8f76618f3..68d8415e6c28 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -1802,6 +1802,7 @@ static inline void ti_sn_gpio_unregister(void) {} static void ti_sn65dsi86_runtime_disable(void *data) { + pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } @@ -1861,11 +1862,11 @@ static int ti_sn65dsi86_probe(struct i2c_client *client, "failed to get reference clock\n"); pm_runtime_enable(dev); + pm_runtime_set_autosuspend_delay(pdata->dev, 500); + pm_runtime_use_autosuspend(pdata->dev); ret = devm_add_action_or_reset(dev, ti_sn65dsi86_runtime_disable, dev); if (ret) return ret; - pm_runtime_set_autosuspend_delay(pdata->dev, 500); - pm_runtime_use_autosuspend(pdata->dev); ti_sn65dsi86_debugfs_init(pdata); -- cgit From cb1852783f790feae845006d062acb9e0a5d4304 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler Date: Mon, 24 Jan 2022 16:24:37 +0000 Subject: drm/arm: arm hdlcd select DRM_GEM_CMA_HELPER Without DRM_GEM_CMA_HELPER HDLCD won't build. This needs to be there too. Fixes: 09717af7d13d ("drm: Remove CONFIG_DRM_KMS_CMA_HELPER option") Reviewed-by: Steven Price Signed-off-by: Carsten Haitzler Acked-by: Liviu Dudau Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220124162437.2470344-1-carsten.haitzler@foss.arm.com --- drivers/gpu/drm/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/arm/Kconfig b/drivers/gpu/drm/arm/Kconfig index 58a242871b28..6e3f1d600541 100644 --- a/drivers/gpu/drm/arm/Kconfig +++ b/drivers/gpu/drm/arm/Kconfig @@ -6,6 +6,7 @@ config DRM_HDLCD depends on DRM && OF && (ARM || ARM64 || COMPILE_TEST) depends on COMMON_CLK select DRM_KMS_HELPER + select DRM_GEM_CMA_HELPER help Choose this option if you have an ARM High Definition Colour LCD controller. -- cgit From 56763f12b0f02706576a088e85ef856deacc98a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 27 Feb 2022 10:01:41 -0800 Subject: netfilter: fix use-after-free in __nf_register_net_hook() We must not dereference @new_hooks after nf_hook_mutex has been released, because other threads might have freed our allocated hooks already. BUG: KASAN: use-after-free in nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] BUG: KASAN: use-after-free in hooks_validate net/netfilter/core.c:171 [inline] BUG: KASAN: use-after-free in __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 Read of size 2 at addr ffff88801c1a8000 by task syz-executor237/4430 CPU: 1 PID: 4430 Comm: syz-executor237 Not tainted 5.17.0-rc5-syzkaller-00306-g2293be58d6a1 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] hooks_validate net/netfilter/core.c:171 [inline] __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 ipv6_setsockopt+0x122/0x180 net/ipv6/ipv6_sockglue.c:1024 rawv6_setsockopt+0xd3/0x6a0 net/ipv6/raw.c:1084 __sys_setsockopt+0x2db/0x610 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f65a1ace7d9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 71 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f65a1a7f308 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f65a1ace7d9 RDX: 0000000000000040 RSI: 0000000000000029 RDI: 0000000000000003 RBP: 00007f65a1b574c8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 00007f65a1b55130 R13: 00007f65a1b574c0 R14: 00007f65a1b24090 R15: 0000000000022000 The buggy address belongs to the page: page:ffffea0000706a00 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1c1a8 flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000000 ffffea0001c1b108 ffffea000046dd08 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as freed page last allocated via order 2, migratetype Unmovable, gfp_mask 0x52dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_ZERO), pid 4430, ts 1061781545818, free_ts 1061791488993 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 __alloc_pages_node include/linux/gfp.h:572 [inline] alloc_pages_node include/linux/gfp.h:595 [inline] kmalloc_large_node+0x62/0x130 mm/slub.c:4438 __kmalloc_node+0x35a/0x4a0 mm/slub.c:4454 kmalloc_node include/linux/slab.h:604 [inline] kvmalloc_node+0x97/0x100 mm/util.c:580 kvmalloc include/linux/slab.h:731 [inline] kvzalloc include/linux/slab.h:739 [inline] allocate_hook_entries_size net/netfilter/core.c:61 [inline] nf_hook_entries_grow+0x140/0x780 net/netfilter/core.c:128 __nf_register_net_hook+0x144/0x820 net/netfilter/core.c:429 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 kvfree+0x42/0x50 mm/util.c:613 rcu_do_batch kernel/rcu/tree.c:2527 [inline] rcu_core+0x7b1/0x1820 kernel/rcu/tree.c:2778 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Memory state around the buggy address: ffff88801c1a7f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a7f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88801c1a8000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff88801c1a8080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a8100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 2420b79f8c18 ("netfilter: debug: check for sorted array") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 354cb472f386..8a77a3fd69bc 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -428,14 +428,15 @@ static int __nf_register_net_hook(struct net *net, int pf, p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); - if (!IS_ERR(new_hooks)) + if (!IS_ERR(new_hooks)) { + hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); + } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); - hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); -- cgit From 17a8f31bba7bac8cce4bd12bab50697da96e7710 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Feb 2022 04:18:05 +0100 Subject: netfilter: egress: silence egress hook lockdep splats Netfilter assumes its called with rcu_read_lock held, but in egress hook case it may be called with BH readlock. This triggers lockdep splat. In order to avoid to change all rcu_dereference() to rcu_dereference_check(..., rcu_read_lock_bh_held()), wrap nf_hook_slow with read lock/unlock pair. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index b4dd96e4dc8d..e6487a691136 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -101,7 +101,11 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, dev, NULL, NULL, dev_net(dev), NULL); + + /* nf assumes rcu_read_lock, not just read_lock_bh */ + rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); + rcu_read_unlock(); if (ret == 1) { return skb; -- cgit From 1866aa0d0d6492bc2f8d22d0df49abaccf50cddd Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 25 Jan 2022 19:31:23 +0200 Subject: e1000e: Fix possible HW unit hang after an s0ix exit Disable the OEM bit/Gig Disable/restart AN impact and disable the PHY LAN connected device (LCD) reset during power management flows. This fixes possible HW unit hangs on the s0ix exit on some corporate ADL platforms. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=214821 Fixes: 3e55d231716e ("e1000e: Add handshake with the CSME to support S0ix") Suggested-by: Dima Ruinskiy Suggested-by: Nir Efrati Signed-off-by: Sasha Neftin Tested-by: Kai-Heng Feng Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/hw.h | 1 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++++ drivers/net/ethernet/intel/e1000e/ich8lan.h | 1 + drivers/net/ethernet/intel/e1000e/netdev.c | 26 ++++++++++++++++++++++++++ 4 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index bcf680e83811..13382df2f2ef 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -630,6 +630,7 @@ struct e1000_phy_info { bool disable_polarity_correction; bool is_mdix; bool polarity_correction; + bool reset_disable; bool speed_downgraded; bool autoneg_wait_to_complete; }; diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index c908c84b86d2..e298da712758 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -2050,6 +2050,10 @@ static s32 e1000_check_reset_block_ich8lan(struct e1000_hw *hw) bool blocked = false; int i = 0; + /* Check the PHY (LCD) reset flag */ + if (hw->phy.reset_disable) + return true; + while ((blocked = !(er32(FWSM) & E1000_ICH_FWSM_RSPCIPHY)) && (i++ < 30)) usleep_range(10000, 11000); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h index 2504b11c3169..638a3ddd7ada 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.h +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h @@ -271,6 +271,7 @@ #define I217_CGFREG_ENABLE_MTA_RESET 0x0002 #define I217_MEMPWR PHY_REG(772, 26) #define I217_MEMPWR_DISABLE_SMB_RELEASE 0x0010 +#define I217_MEMPWR_MOEM 0x1000 /* Receive Address Initial CRC Calculation */ #define E1000_PCH_RAICC(_n) (0x05F50 + ((_n) * 4)) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index a42aeb555f34..c5bdef3ffe26 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6987,8 +6987,21 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); + struct e1000_hw *hw = &adapter->hw; + u16 phy_data; int rc; + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { + /* Mask OEM Bits / Gig Disable / Restart AN (772_26[12] = 1) */ + e1e_rphy(hw, I217_MEMPWR, &phy_data); + phy_data |= I217_MEMPWR_MOEM; + e1e_wphy(hw, I217_MEMPWR, phy_data); + + /* Disable LCD reset */ + hw->phy.reset_disable = true; + } + e1000e_flush_lpic(pdev); e1000e_pm_freeze(dev); @@ -7010,6 +7023,8 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); + struct e1000_hw *hw = &adapter->hw; + u16 phy_data; int rc; /* Introduce S0ix implementation */ @@ -7020,6 +7035,17 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) if (rc) return rc; + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { + /* Unmask OEM Bits / Gig Disable / Restart AN 772_26[12] = 0 */ + e1e_rphy(hw, I217_MEMPWR, &phy_data); + phy_data &= ~I217_MEMPWR_MOEM; + e1e_wphy(hw, I217_MEMPWR, phy_data); + + /* Enable LCD reset */ + hw->phy.reset_disable = false; + } + return e1000e_pm_thaw(dev); } -- cgit From ffd24fa2fcc76ecb2e61e7a4ef8588177bcb42a6 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Thu, 3 Feb 2022 14:21:49 +0200 Subject: e1000e: Correct NVM checksum verification flow Update MAC type check e1000_pch_tgp because for e1000_pch_cnp, NVM checksum update is still possible. Emit a more detailed warning message. Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=1191663 Fixes: 4051f68318ca ("e1000e: Do not take care about recovery NVM checksum") Reported-by: Thomas Bogendoerfer Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index e298da712758..d60e2016d03c 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -4140,9 +4140,9 @@ static s32 e1000_validate_nvm_checksum_ich8lan(struct e1000_hw *hw) return ret_val; if (!(data & valid_csum_mask)) { - e_dbg("NVM Checksum Invalid\n"); + e_dbg("NVM Checksum valid bit not set\n"); - if (hw->mac.type < e1000_pch_cnp) { + if (hw->mac.type < e1000_pch_tgp) { data |= valid_csum_mask; ret_val = e1000_write_nvm(hw, word, 1, &data); if (ret_val) -- cgit From 7795686d573de0438bba6b2b344e6b203223c889 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 26 Jan 2022 12:02:04 +0100 Subject: pinctrl-sunxi: sunxi_pinctrl_gpio_direction_in/output: use correct offset The commit that sets the direction directly without calling pinctrl_gpio_direction(), forgot to add chip->base to the offset when calling sunxi_pmx_gpio_set_direction(). This caused failures for various Allwinner boards which have two GPIO blocks. Signed-off-by: Hans Verkuil Reported-by: 5kft <5kft@5kft.org> Suggested-by: 5kft <5kft@5kft.org> Reported-by: Corentin Labbe Fixes: 8df89a7cbc63 (pinctrl-sunxi: don't call pinctrl_gpio_direction()) Tested-by: Corentin Labbe Tested-by: Jernej Skrabec Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/0f536cd8-01db-5d16-2cec-ec6d19409a49@xs4all.nl Signed-off-by: Guenter Roeck [Picked from linux-next to pinctrl fixes] Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sunxi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c index 80d6750c74a6..061323eab8b1 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c @@ -837,7 +837,8 @@ static int sunxi_pinctrl_gpio_direction_input(struct gpio_chip *chip, { struct sunxi_pinctrl *pctl = gpiochip_get_data(chip); - return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, offset, true); + return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, + chip->base + offset, true); } static int sunxi_pinctrl_gpio_get(struct gpio_chip *chip, unsigned offset) @@ -890,7 +891,8 @@ static int sunxi_pinctrl_gpio_direction_output(struct gpio_chip *chip, struct sunxi_pinctrl *pctl = gpiochip_get_data(chip); sunxi_pinctrl_gpio_set(chip, offset, value); - return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, offset, false); + return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, + chip->base + offset, false); } static int sunxi_pinctrl_gpio_of_xlate(struct gpio_chip *gc, -- cgit From bac129dbc6560dfeb634c03f0c08b78024e71915 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 15 Feb 2022 22:00:36 -0600 Subject: pinctrl: sunxi: Use unique lockdep classes for IRQs This driver, like several others, uses a chained IRQ for each GPIO bank, and forwards .irq_set_wake to the GPIO bank's upstream IRQ. As a result, a call to irq_set_irq_wake() needs to lock both the upstream and downstream irq_desc's. Lockdep considers this to be a possible deadlock when the irq_desc's share lockdep classes, which they do by default: ============================================ WARNING: possible recursive locking detected 5.17.0-rc3-00394-gc849047c2473 #1 Not tainted -------------------------------------------- init/307 is trying to acquire lock: c2dfe27c (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0x58/0xa0 but task is already holding lock: c3c0ac7c (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0x58/0xa0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&irq_desc_lock_class); lock(&irq_desc_lock_class); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by init/307: #0: c1f29f18 (system_transition_mutex){+.+.}-{3:3}, at: __do_sys_reboot+0x90/0x23c #1: c20f7760 (&dev->mutex){....}-{3:3}, at: device_shutdown+0xf4/0x224 #2: c2e804d8 (&dev->mutex){....}-{3:3}, at: device_shutdown+0x104/0x224 #3: c3c0ac7c (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0x58/0xa0 stack backtrace: CPU: 0 PID: 307 Comm: init Not tainted 5.17.0-rc3-00394-gc849047c2473 #1 Hardware name: Allwinner sun8i Family unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x90 dump_stack_lvl from __lock_acquire+0x1680/0x31a0 __lock_acquire from lock_acquire+0x148/0x3dc lock_acquire from _raw_spin_lock_irqsave+0x50/0x6c _raw_spin_lock_irqsave from __irq_get_desc_lock+0x58/0xa0 __irq_get_desc_lock from irq_set_irq_wake+0x2c/0x19c irq_set_irq_wake from irq_set_irq_wake+0x13c/0x19c [tail call from sunxi_pinctrl_irq_set_wake] irq_set_irq_wake from gpio_keys_suspend+0x80/0x1a4 gpio_keys_suspend from gpio_keys_shutdown+0x10/0x2c gpio_keys_shutdown from device_shutdown+0x180/0x224 device_shutdown from __do_sys_reboot+0x134/0x23c __do_sys_reboot from ret_fast_syscall+0x0/0x1c However, this can never deadlock because the upstream and downstream IRQs are never the same (nor do they even involve the same irqchip). Silence this erroneous lockdep splat by applying what appears to be the usual fix of moving the GPIO IRQs to separate lockdep classes. Fixes: a59c99d9eaf9 ("pinctrl: sunxi: Forward calls to irq_set_irq_wake") Reported-by: Guenter Roeck Signed-off-by: Samuel Holland Reviewed-by: Jernej Skrabec Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20220216040037.22730-1-samuel@sholland.org Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sunxi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c index 061323eab8b1..1f401377ff60 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c @@ -36,6 +36,13 @@ #include "../core.h" #include "pinctrl-sunxi.h" +/* + * These lock classes tell lockdep that GPIO IRQs are in a different + * category than their parents, so it won't report false recursion. + */ +static struct lock_class_key sunxi_pinctrl_irq_lock_class; +static struct lock_class_key sunxi_pinctrl_irq_request_class; + static struct irq_chip sunxi_pinctrl_edge_irq_chip; static struct irq_chip sunxi_pinctrl_level_irq_chip; @@ -1557,6 +1564,8 @@ int sunxi_pinctrl_init_with_variant(struct platform_device *pdev, for (i = 0; i < (pctl->desc->irq_banks * IRQ_PER_BANK); i++) { int irqno = irq_create_mapping(pctl->domain, i); + irq_set_lockdep_class(irqno, &sunxi_pinctrl_irq_lock_class, + &sunxi_pinctrl_irq_request_class); irq_set_chip_and_handler(irqno, &sunxi_pinctrl_edge_irq_chip, handle_edge_irq); irq_set_chip_data(irqno, pctl); -- cgit From d176708ffc20332d1c730098d2b111e0b77ece82 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 22:52:31 -0800 Subject: Input: goodix - use the new soc_intel_is_byt() helper Use the new soc_intel_is_byt() helper from linux/platform_data/x86/soc.h. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220131143539.109142-5-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/goodix.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index a3bfc7a41679..3647147fd818 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -805,21 +806,6 @@ static int goodix_reset(struct goodix_ts_data *ts) } #ifdef ACPI_GPIO_SUPPORT -#include -#include - -static const struct x86_cpu_id baytrail_cpu_ids[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, X86_FEATURE_ANY, }, - {} -}; - -static inline bool is_byt(void) -{ - const struct x86_cpu_id *id = x86_match_cpu(baytrail_cpu_ids); - - return !!id; -} - static const struct acpi_gpio_params first_gpio = { 0, 0, false }; static const struct acpi_gpio_params second_gpio = { 1, 0, false }; @@ -903,7 +889,7 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) dev_info(dev, "Using ACPI INTI and INTO methods for IRQ pin access\n"); ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_METHOD; gpio_mapping = acpi_goodix_reset_only_gpios; - } else if (is_byt() && ts->gpio_count == 2 && ts->gpio_int_idx == -1) { + } else if (soc_intel_is_byt() && ts->gpio_count == 2 && ts->gpio_int_idx == -1) { dev_info(dev, "No ACPI GpioInt resource, assuming that the GPIO order is reset, int\n"); ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO; gpio_mapping = acpi_goodix_int_last_gpios; -- cgit From d982992669733dd75520000c6057d8ee0725a363 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 22:53:12 -0800 Subject: Input: goodix - workaround Cherry Trail devices with a bogus ACPI Interrupt() resource ACPI/x86 devices with a Cherry Trail SoC should have a GpioInt + a regular GPIO ACPI resource in their ACPI tables. Some CHT devices have a bug, where the also is bogus interrupt resource (likely copied from a previous Bay Trail based generation of the device). The i2c-core-acpi code will assign the bogus, non-working, interrupt resource to client->irq. Add a workaround to fix this up. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=2043960 Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220228111613.363336-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/goodix.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index 3647147fd818..752e8ba4fecb 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -864,7 +864,7 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) const struct acpi_gpio_mapping *gpio_mapping = NULL; struct device *dev = &ts->client->dev; LIST_HEAD(resources); - int ret; + int irq, ret; ts->gpio_count = 0; ts->gpio_int_idx = -1; @@ -877,6 +877,20 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) acpi_dev_free_resource_list(&resources); + /* + * CHT devices should have a GpioInt + a regular GPIO ACPI resource. + * Some CHT devices have a bug (where the also is bogus Interrupt + * resource copied from a previous BYT based generation). i2c-core-acpi + * will use the non-working Interrupt resource, fix this up. + */ + if (soc_intel_is_cht() && ts->gpio_count == 2 && ts->gpio_int_idx != -1) { + irq = acpi_dev_gpio_irq_get(ACPI_COMPANION(dev), 0); + if (irq > 0 && irq != ts->client->irq) { + dev_warn(dev, "Overriding IRQ %d -> %d\n", ts->client->irq, irq); + ts->client->irq = irq; + } + } + if (ts->gpio_count == 2 && ts->gpio_int_idx == 0) { ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO; gpio_mapping = acpi_goodix_int_first_gpios; -- cgit From c432cd598a185afefba1ac3b0ee226f222f71341 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Fri, 28 Jan 2022 15:20:56 +0100 Subject: soc: mediatek: mt8192-mmsys: Fix dither to dsi0 path's input sel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit d687e056a18f ("soc: mediatek: mmsys: Add mt8192 mmsys routing table"), the mmsys routing table for mt8192 was introduced but the input selector for DITHER->DSI0 has no value assigned to it. This means that we are clearing bit 0 instead of setting it, blocking communication between these two blocks; due to that, any display that is connected to DSI0 will not work, as no data will go through. The effect of that issue is that, during bootup, the DRM will block for some time, while atomically waiting for a vblank that never happens; later, the situation doesn't get better, leaving the display in a non-functional state. To fix this issue, fix the route entry in the table by assigning the dither input selector to MT8192_DISP_DSI0_SEL_IN. Fixes: d687e056a18f ("soc: mediatek: mmsys: Add mt8192 mmsys routing table") Signed-off-by: AngeloGioacchino Del Regno Tested-by: Alyssa Rosenzweig Reviewed-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/r/20220128142056.359900-1-angelogioacchino.delregno@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/mt8192-mmsys.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/mediatek/mt8192-mmsys.h b/drivers/soc/mediatek/mt8192-mmsys.h index 6f0a57044a7b..6aae0b12b6ff 100644 --- a/drivers/soc/mediatek/mt8192-mmsys.h +++ b/drivers/soc/mediatek/mt8192-mmsys.h @@ -53,7 +53,8 @@ static const struct mtk_mmsys_routes mmsys_mt8192_routing_table[] = { MT8192_AAL0_SEL_IN_CCORR0 }, { DDP_COMPONENT_DITHER, DDP_COMPONENT_DSI0, - MT8192_DISP_DSI0_SEL_IN, MT8192_DSI0_SEL_IN_DITHER0 + MT8192_DISP_DSI0_SEL_IN, MT8192_DSI0_SEL_IN_DITHER0, + MT8192_DSI0_SEL_IN_DITHER0 }, { DDP_COMPONENT_RDMA0, DDP_COMPONENT_COLOR0, MT8192_DISP_RDMA0_SOUT_SEL, MT8192_RDMA0_SOUT_COLOR0, -- cgit From 5d8965704fe5662e2e4a7e4424a2cbe53e182670 Mon Sep 17 00:00:00 2001 From: Ilya Lipnitskiy Date: Mon, 28 Feb 2022 17:15:07 -0800 Subject: MIPS: ralink: mt7621: use bitwise NOT instead of logical It was the intention to reverse the bits, not make them all zero by using logical NOT operator. Fixes: cc19db8b312a ("MIPS: ralink: mt7621: do memory detection on KSEG1") Suggested-by: Chuanhong Guo Signed-off-by: Ilya Lipnitskiy Reviewed-by: Sergio Paracuellos Signed-off-by: Thomas Bogendoerfer --- arch/mips/ralink/mt7621.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index 12c8808e0dea..fb0565bc34fd 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -69,7 +69,7 @@ static bool __init mt7621_addr_wraparound_test(phys_addr_t size) __raw_writel(MT7621_MEM_TEST_PATTERN, dm); if (__raw_readl(dm) != __raw_readl(dm + size)) return false; - __raw_writel(!MT7621_MEM_TEST_PATTERN, dm); + __raw_writel(~MT7621_MEM_TEST_PATTERN, dm); return __raw_readl(dm) == __raw_readl(dm + size); } -- cgit From 50bb467c9e76743fbc8441d29113cdad62dbc4fe Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 18 Feb 2022 09:38:58 +0000 Subject: rfkill: define rfill_soft_blocked() if !RFKILL If CONFIG_RFKILL is not set, the Intel WiFi driver will not build the iw_mvm driver part due to the missing rfill_soft_blocked() call. Adding a inline declaration of rfill_soft_blocked() if CONFIG_RFKILL=n fixes the following error: drivers/net/wireless/intel/iwlwifi/mvm/mvm.h: In function 'iwl_mvm_mei_set_sw_rfkill_state': drivers/net/wireless/intel/iwlwifi/mvm/mvm.h:2215:38: error: implicit declaration of function 'rfkill_soft_blocked'; did you mean 'rfkill_blocked'? [-Werror=implicit-function-declaration] 2215 | mvm->hw_registered ? rfkill_soft_blocked(mvm->hw->wiphy->rfkill) : false; | ^~~~~~~~~~~~~~~~~~~ | rfkill_blocked Signed-off-by: Ben Dooks Reported-by: Neill Whillans Fixes: 5bc9a9dd7535 ("rfkill: allow to get the software rfkill state") Link: https://lore.kernel.org/r/20220218093858.1245677-1-ben.dooks@codethink.co.uk Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index c35f3962dc4f..373003ace639 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -308,6 +308,11 @@ static inline bool rfkill_blocked(struct rfkill *rfkill) return false; } +static inline bool rfkill_soft_blocked(struct rfkill *rfkill) +{ + return false; +} + static inline enum rfkill_type rfkill_find_type(const char *name) { return RFKILL_TYPE_ALL; -- cgit From 1db5fcbba2631277b78d7f8aff99c9607d29f6d8 Mon Sep 17 00:00:00 2001 From: Golan Ben Ami Date: Tue, 1 Mar 2022 09:29:26 +0200 Subject: iwlwifi: don't advertise TWT support Some APs misbehave when TWT is used and cause our firmware to crash. We don't know a reasonable way to detect and work around this problem in the FW yet. To prevent these crashes, disable TWT in the driver by stopping to advertise TWT support. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215523 Signed-off-by: Golan Ben Ami [reworded the commit message] Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20220301072926.153969-1-luca@coelho.fi Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 3 +-- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index dd58c8f9aa11..04addf964d83 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -553,8 +553,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = - IEEE80211_HE_MAC_CAP0_HTC_HE | - IEEE80211_HE_MAC_CAP0_TWT_REQ, + IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 4ac599f6ad22..709a3df57b10 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -226,7 +226,6 @@ static const u8 he_if_types_ext_capa_sta[] = { [0] = WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING, [2] = WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT, [7] = WLAN_EXT_CAPA8_OPMODE_NOTIF, - [9] = WLAN_EXT_CAPA10_TWT_REQUESTER_SUPPORT, }; static const struct wiphy_iftype_ext_capab he_iftypes_ext_capa[] = { -- cgit From 5a6248c0a22352f09ea041665d3bd3e18f6f872c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 22 Feb 2022 19:06:30 -0800 Subject: iwlwifi: mvm: check debugfs_dir ptr before use When "debugfs=off" is used on the kernel command line, iwiwifi's mvm module uses an invalid/unchecked debugfs_dir pointer and causes a BUG: BUG: kernel NULL pointer dereference, address: 000000000000004f #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 1 PID: 503 Comm: modprobe Tainted: G W 5.17.0-rc5 #7 Hardware name: Dell Inc. Inspiron 15 5510/076F7Y, BIOS 2.4.1 11/05/2021 RIP: 0010:iwl_mvm_dbgfs_register+0x692/0x700 [iwlmvm] Code: 69 a0 be 80 01 00 00 48 c7 c7 50 73 6a a0 e8 95 cf ee e0 48 8b 83 b0 1e 00 00 48 c7 c2 54 73 6a a0 be 64 00 00 00 48 8d 7d 8c <48> 8b 48 50 e8 15 22 07 e1 48 8b 43 28 48 8d 55 8c 48 c7 c7 5f 73 RSP: 0018:ffffc90000a0ba68 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffff88817d6e3328 RCX: ffff88817d6e3328 RDX: ffffffffa06a7354 RSI: 0000000000000064 RDI: ffffc90000a0ba6c RBP: ffffc90000a0bae0 R08: ffffffff824e4880 R09: ffffffffa069d620 R10: ffffc90000a0ba00 R11: ffffffffffffffff R12: 0000000000000000 R13: ffffc90000a0bb28 R14: ffff88817d6e3328 R15: ffff88817d6e3320 FS: 00007f64dd92d740(0000) GS:ffff88847f640000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000004f CR3: 000000016fc79001 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: ? iwl_mvm_mac_setup_register+0xbdc/0xda0 [iwlmvm] iwl_mvm_start_post_nvm+0x71/0x100 [iwlmvm] iwl_op_mode_mvm_start+0xab8/0xb30 [iwlmvm] _iwl_op_mode_start+0x6f/0xd0 [iwlwifi] iwl_opmode_register+0x6a/0xe0 [iwlwifi] ? 0xffffffffa0231000 iwl_mvm_init+0x35/0x1000 [iwlmvm] ? 0xffffffffa0231000 do_one_initcall+0x5a/0x1b0 ? kmem_cache_alloc+0x1e5/0x2f0 ? do_init_module+0x1e/0x220 do_init_module+0x48/0x220 load_module+0x2602/0x2bc0 ? __kernel_read+0x145/0x2e0 ? kernel_read_file+0x229/0x290 __do_sys_finit_module+0xc5/0x130 ? __do_sys_finit_module+0xc5/0x130 __x64_sys_finit_module+0x13/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f64dda564dd Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 1b 29 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffdba393f88 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f64dda564dd RDX: 0000000000000000 RSI: 00005575399e2ab2 RDI: 0000000000000001 RBP: 000055753a91c5e0 R08: 0000000000000000 R09: 0000000000000002 R10: 0000000000000001 R11: 0000000000000246 R12: 00005575399e2ab2 R13: 000055753a91ceb0 R14: 0000000000000000 R15: 000055753a923018 Modules linked in: btintel(+) btmtk bluetooth vfat snd_hda_codec_hdmi fat snd_hda_codec_realtek snd_hda_codec_generic iwlmvm(+) snd_sof_pci_intel_tgl mac80211 snd_sof_intel_hda_common soundwire_intel soundwire_generic_allocation soundwire_cadence soundwire_bus snd_sof_intel_hda snd_sof_pci snd_sof snd_sof_xtensa_dsp snd_soc_hdac_hda snd_hda_ext_core snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core btrfs snd_compress snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec raid6_pq iwlwifi snd_hda_core snd_pcm snd_timer snd soundcore cfg80211 intel_ish_ipc(+) thunderbolt rfkill intel_ishtp ucsi_acpi wmi i2c_hid_acpi i2c_hid evdev CR2: 000000000000004f ---[ end trace 0000000000000000 ]--- Check the debugfs_dir pointer for an error before using it. Fixes: 8c082a99edb9 ("iwlwifi: mvm: simplify iwl_mvm_dbgfs_register") Signed-off-by: Randy Dunlap Cc: Luca Coelho Cc: linux-wireless@vger.kernel.org Cc: Kalle Valo Cc: Greg Kroah-Hartman Cc: Emmanuel Grumbach Cc: stable Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220223030630.23241-1-rdunlap@infradead.org [change to make both conditional] Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c index 63432c24eb59..445c94adb076 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c @@ -5,6 +5,7 @@ * Copyright (C) 2016-2017 Intel Deutschland GmbH */ #include +#include #include #include @@ -1857,7 +1858,6 @@ void iwl_mvm_sta_add_debugfs(struct ieee80211_hw *hw, void iwl_mvm_dbgfs_register(struct iwl_mvm *mvm) { struct dentry *bcast_dir __maybe_unused; - char buf[100]; spin_lock_init(&mvm->drv_stats_lock); @@ -1939,6 +1939,11 @@ void iwl_mvm_dbgfs_register(struct iwl_mvm *mvm) * Create a symlink with mac80211. It will be removed when mac80211 * exists (before the opmode exists which removes the target.) */ - snprintf(buf, 100, "../../%pd2", mvm->debugfs_dir->d_parent); - debugfs_create_symlink("iwlwifi", mvm->hw->wiphy->debugfsdir, buf); + if (!IS_ERR(mvm->debugfs_dir)) { + char buf[100]; + + snprintf(buf, 100, "../../%pd2", mvm->debugfs_dir->d_parent); + debugfs_create_symlink("iwlwifi", mvm->hw->wiphy->debugfsdir, + buf); + } } -- cgit From 6ad27f522cb3b210476daf63ce6ddb6568c0508b Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 1 Mar 2022 18:00:20 +0800 Subject: nl80211: Handle nla_memdup failures in handle_nan_filter As there's potential for failure of the nla_memdup(), check the return value. Fixes: a442b761b24b ("cfg80211: add add_nan_func / del_nan_func") Signed-off-by: Jiasheng Jiang Link: https://lore.kernel.org/r/20220301100020.3801187-1-jiasheng@iscas.ac.cn Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 578bff9c378b..b1909ce2b739 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13411,6 +13411,9 @@ static int handle_nan_filter(struct nlattr *attr_filter, i = 0; nla_for_each_nested(attr, attr_filter, rem) { filter[i].filter = nla_memdup(attr, GFP_KERNEL); + if (!filter[i].filter) + goto err; + filter[i].len = nla_len(attr); i++; } @@ -13423,6 +13426,15 @@ static int handle_nan_filter(struct nlattr *attr_filter, } return 0; + +err: + i = 0; + nla_for_each_nested(attr, attr_filter, rem) { + kfree(filter[i].filter); + i++; + } + kfree(filter); + return -ENOMEM; } static int nl80211_nan_add_func(struct sk_buff *skb, -- cgit From 94d9864cc86f572f881db9b842a78e9d075493ae Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Feb 2022 10:39:34 +0100 Subject: mac80211: treat some SAE auth steps as final When we get anti-clogging token required (added by the commit mentioned below), or the other status codes added by the later commit 4e56cde15f7d ("mac80211: Handle special status codes in SAE commit") we currently just pretend (towards the internal state machine of authentication) that we didn't receive anything. This has the undesirable consequence of retransmitting the prior frame, which is not expected, because the timer is still armed. If we just disarm the timer at that point, it would result in the undesirable side effect of being in this state indefinitely if userspace crashes, or so. So to fix this, reset the timer and set a new auth_data->waiting in order to have no more retransmissions, but to have the data destroyed when the timer actually fires, which will only happen if userspace didn't continue (i.e. crashed or abandoned it.) Fixes: a4055e74a2ff ("mac80211: Don't destroy auth data in case of anti-clogging") Reported-by: Jouni Malinen Link: https://lore.kernel.org/r/20220224103932.75964e1d7932.Ia487f91556f29daae734bf61f8181404642e1eec@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 330ea62231fa..e87bccaab561 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -376,7 +376,7 @@ struct ieee80211_mgd_auth_data { u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; - bool done; + bool done, waiting; bool peer_confirmed; bool timeout_started; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e5ccf17618ab..744842c4513b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -37,6 +37,7 @@ #define IEEE80211_AUTH_TIMEOUT_SAE (HZ * 2) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) +#define IEEE80211_AUTH_WAIT_SAE_RETRY (HZ * 2) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) #define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) @@ -3011,8 +3012,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || - status_code == WLAN_STATUS_SAE_PK)))) + status_code == WLAN_STATUS_SAE_PK)))) { + /* waiting for userspace now */ + ifmgd->auth_data->waiting = true; + ifmgd->auth_data->timeout = + jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; + ifmgd->auth_data->timeout_started = true; + run_again(sdata, ifmgd->auth_data->timeout); goto notify_driver; + } sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); @@ -4603,10 +4611,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) if (ifmgd->auth_data && ifmgd->auth_data->timeout_started && time_after(jiffies, ifmgd->auth_data->timeout)) { - if (ifmgd->auth_data->done) { + if (ifmgd->auth_data->done || ifmgd->auth_data->waiting) { /* - * ok ... we waited for assoc but userspace didn't, - * so let's just kill the auth data + * ok ... we waited for assoc or continuation but + * userspace didn't do it, so kill the auth data */ ieee80211_destroy_auth_data(sdata, false); } else if (ieee80211_auth(sdata)) { -- cgit From 747670fd9a2d1b7774030dba65ca022ba442ce71 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 25 Feb 2022 14:02:41 +0100 Subject: netfilter: nf_queue: don't assume sk is full socket There is no guarantee that state->sk refers to a full socket. If refcount transitions to 0, sock_put calls sk_free which then ends up with garbage fields. I'd like to thank Oleksandr Natalenko and Jiri Benc for considerable debug work and pointing out state->sk oddities. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Tested-by: Oleksandr Natalenko Signed-off-by: Florian Westphal --- net/netfilter/nf_queue.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 6d12afabfe8a..5ab0680db445 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,15 @@ void nf_unregister_queue_handler(void) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_sock_put(struct sock *sk) +{ +#ifdef CONFIG_INET + sock_gen_put(sk); +#else + sock_put(sk); +#endif +} + static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -54,7 +63,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(state->in); dev_put(state->out); if (state->sk) - sock_put(state->sk); + nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_put(entry->physin); -- cgit From 2e78855d311c401083df9776aa450d32d716e83e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 25 Feb 2022 12:01:23 +0100 Subject: selftests: netfilter: add nfqueue TCP_NEW_SYN_RECV socket race test causes: BUG: KASAN: slab-out-of-bounds in sk_free+0x25/0x80 Write of size 4 at addr ffff888106df0284 by task nf-queue/1459 sk_free+0x25/0x80 nf_queue_entry_release_refs+0x143/0x1a0 nf_reinject+0x233/0x770 ... without 'netfilter: nf_queue: don't assume sk is full socket'. Signed-off-by: Florian Westphal --- tools/testing/selftests/netfilter/.gitignore | 1 + tools/testing/selftests/netfilter/Makefile | 2 +- tools/testing/selftests/netfilter/connect_close.c | 136 ++++++++++++++++++++++ tools/testing/selftests/netfilter/nft_queue.sh | 19 +++ 4 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/netfilter/connect_close.c diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/netfilter/.gitignore index 8448f74adfec..4cb887b57413 100644 --- a/tools/testing/selftests/netfilter/.gitignore +++ b/tools/testing/selftests/netfilter/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only nf-queue +connect_close diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index e4f845dd942b..7e81c9a7fff9 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -9,6 +9,6 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ conntrack_vrf.sh nft_synproxy.sh LDLIBS = -lmnl -TEST_GEN_FILES = nf-queue +TEST_GEN_FILES = nf-queue connect_close include ../lib.mk diff --git a/tools/testing/selftests/netfilter/connect_close.c b/tools/testing/selftests/netfilter/connect_close.c new file mode 100644 index 000000000000..1c3b0add54c4 --- /dev/null +++ b/tools/testing/selftests/netfilter/connect_close.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PORT 12345 +#define RUNTIME 10 + +static struct { + unsigned int timeout; + unsigned int port; +} opts = { + .timeout = RUNTIME, + .port = PORT, +}; + +static void handler(int sig) +{ + _exit(sig == SIGALRM ? 0 : 1); +} + +static void set_timeout(void) +{ + struct sigaction action = { + .sa_handler = handler, + }; + + sigaction(SIGALRM, &action, NULL); + + alarm(opts.timeout); +} + +static void do_connect(const struct sockaddr_in *dst) +{ + int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s >= 0) + fcntl(s, F_SETFL, O_NONBLOCK); + + connect(s, (struct sockaddr *)dst, sizeof(*dst)); + close(s); +} + +static void do_accept(const struct sockaddr_in *src) +{ + int c, one = 1, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s < 0) + return; + + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); + setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + + bind(s, (struct sockaddr *)src, sizeof(*src)); + + listen(s, 16); + + c = accept(s, NULL, NULL); + if (c >= 0) + close(c); + + close(s); +} + +static int accept_loop(void) +{ + struct sockaddr_in src = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &src.sin_addr); + + set_timeout(); + + for (;;) + do_accept(&src); + + return 1; +} + +static int connect_loop(void) +{ + struct sockaddr_in dst = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); + + set_timeout(); + + for (;;) + do_connect(&dst); + + return 1; +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "t:p:")) != -1) { + switch (c) { + case 't': + opts.timeout = atoi(optarg); + break; + case 'p': + opts.port = atoi(optarg); + break; + } + } +} + +int main(int argc, char *argv[]) +{ + pid_t p; + + parse_opts(argc, argv); + + p = fork(); + if (p < 0) + return 111; + + if (p > 0) + return accept_loop(); + + return connect_loop(); +} diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh index 7d27f1f3bc01..e12729753351 100755 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ b/tools/testing/selftests/netfilter/nft_queue.sh @@ -113,6 +113,7 @@ table inet $name { chain output { type filter hook output priority $prio; policy accept; tcp dport 12345 queue num 3 + tcp sport 23456 queue num 3 jump nfq } chain post { @@ -296,6 +297,23 @@ test_tcp_localhost() wait 2>/dev/null } +test_tcp_localhost_connectclose() +{ + tmpfile=$(mktemp) || exit 1 + + ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & + + ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & + local nfqpid=$! + + sleep 1 + rm -f "$tmpfile" + + wait $rpid + [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" + wait 2>/dev/null +} + test_tcp_localhost_requeue() { ip netns exec ${nsrouter} nft -f /dev/stdin < Date: Mon, 28 Feb 2022 06:22:22 +0100 Subject: netfilter: nf_queue: fix possible use-after-free Eric Dumazet says: The sock_hold() side seems suspect, because there is no guarantee that sk_refcnt is not already 0. On failure, we cannot queue the packet and need to indicate an error. The packet will be dropped by the caller. v2: split skb prefetch hunk into separate change Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.") Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 2 +- net/netfilter/nf_queue.c | 13 +++++++++---- net/netfilter/nfnetlink_queue.c | 12 +++++++++--- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 9eed51e920e8..980daa6e1e3a 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5ab0680db445..e39549c55945 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) } /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + dev_hold(state->in); dev_hold(state->out); - if (state->sk) - sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_hold(entry->physin); dev_hold(entry->physout); #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -196,7 +198,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, __nf_queue_entry_init_physdevs(entry); - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index ea2d9c2a44cf..64a6acb6aeae 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -710,9 +710,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -- cgit From 3b836da4081fa585cf6c392f62557496f2cb0efe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 1 Mar 2022 00:46:19 +0100 Subject: netfilter: nf_queue: handle socket prefetch In case someone combines bpf socket assign and nf_queue, then we will queue an skb who references a struct sock that did not have its reference count incremented. As we leave rcu protection, there is no guarantee that skb->sk is still valid. For refcount-less skb->sk case, try to increment the reference count and then override the destructor. In case of failure we have two choices: orphan the skb and 'delete' preselect or let nf_queue() drop the packet. Do the latter, it should not happen during normal operation. Fixes: cf7fbe660f2d ("bpf: Add socket assign support") Acked-by: Joe Stringer Signed-off-by: Florian Westphal --- net/netfilter/nf_queue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index e39549c55945..63d1516816b1 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -180,6 +180,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, break; } + if (skb_sk_is_prefetched(skb)) { + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) { + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + return -ENOTCONN; + + /* drop refcount on skb_orphan */ + skb->destructor = sock_edemux; + } + } + entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) return -ENOMEM; -- cgit From 4ff2980b6bd2aa6b4ded3ce3b7c0ccfab29980af Mon Sep 17 00:00:00 2001 From: Lina Wang Date: Sat, 26 Feb 2022 15:48:01 +0800 Subject: xfrm: fix tunnel model fragmentation behavior in tunnel mode, if outer interface(ipv4) is less, it is easily to let inner IPV6 mtu be less than 1280. If so, a Packet Too Big ICMPV6 message is received. When send again, packets are fragmentized with 1280, they are still rejected with ICMPV6(Packet Too Big) by xfrmi_xmit2(). According to RFC4213 Section3.2.2: if (IPv4 path MTU - 20) is less than 1280 if packet is larger than 1280 bytes Send ICMPv6 "packet too big" with MTU=1280 Drop packet else Encapsulate but do not set the Don't Fragment flag in the IPv4 header. The resulting IPv4 packet might be fragmented by the IPv4 layer on the encapsulator or by some router along the IPv4 path. endif else if packet is larger than (IPv4 path MTU - 20) Send ICMPv6 "packet too big" with MTU = (IPv4 path MTU - 20). Drop packet. else Encapsulate and set the Don't Fragment flag in the IPv4 header. endif endif Packets should be fragmentized with ipv4 outer interface, so change it. After it is fragemtized with ipv4, there will be double fragmenation. No.48 & No.51 are ipv6 fragment packets, No.48 is double fragmentized, then tunneled with IPv4(No.49& No.50), which obey spec. And received peer cannot decrypt it rightly. 48 2002::10 2002::11 1296(length) IPv6 fragment (off=0 more=y ident=0xa20da5bc nxt=50) 49 0x0000 (0) 2002::10 2002::11 1304 IPv6 fragment (off=0 more=y ident=0x7448042c nxt=44) 50 0x0000 (0) 2002::10 2002::11 200 ESP (SPI=0x00035000) 51 2002::10 2002::11 180 Echo (ping) request 52 0x56dc 2002::10 2002::11 248 IPv6 fragment (off=1232 more=n ident=0xa20da5bc nxt=50) xfrm6_noneed_fragment has fixed above issues. Finally, it acted like below: 1 0x6206 192.168.1.138 192.168.1.1 1316 Fragmented IP protocol (proto=Encap Security Payload 50, off=0, ID=6206) [Reassembled in #2] 2 0x6206 2002::10 2002::11 88 IPv6 fragment (off=0 more=y ident=0x1f440778 nxt=50) 3 0x0000 2002::10 2002::11 248 ICMPv6 Echo (ping) request Signed-off-by: Lina Wang Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 16 ++++++++++++++++ net/xfrm/xfrm_interface.c | 5 ++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index d0d280077721..ad07904642ca 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -45,6 +45,19 @@ static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buf return xfrm_output(sk, skb); } +static int xfrm6_noneed_fragment(struct sk_buff *skb) +{ + struct frag_hdr *fh; + u8 prevhdr = ipv6_hdr(skb)->nexthdr; + + if (prevhdr != NEXTHDR_FRAGMENT) + return 0; + fh = (struct frag_hdr *)(skb->data + sizeof(struct ipv6hdr)); + if (fh->nexthdr == NEXTHDR_ESP || fh->nexthdr == NEXTHDR_AUTH) + return 1; + return 0; +} + static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -73,6 +86,9 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; + } else if (toobig && xfrm6_noneed_fragment(skb)) { + skb->ignore_df = 1; + goto skip_frag; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 4e3c62d1ad9e..1e8b26eecb3f 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -304,7 +304,10 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + if (skb->len > 1280) + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; -- cgit From ea49432d184a6a09f84461604b7711a4e9f5ec9c Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Tue, 1 Mar 2022 19:43:49 +0900 Subject: ARM: mstar: Select HAVE_ARM_ARCH_TIMER The mstar SoCs have an arch timer but HAVE_ARM_ARCH_TIMER wasn't selected. If MSC313E_TIMER isn't selected then the kernel gets stuck at boot because there are no timers available. Signed-off-by: Daniel Palmer Link: https://lore.kernel.org/r/20220301104349.3040422-1-daniel@0x0f.com' Signed-off-by: Arnd Bergmann --- arch/arm/mach-mstar/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-mstar/Kconfig b/arch/arm/mach-mstar/Kconfig index cd300eeedc20..0bf4d312bcfd 100644 --- a/arch/arm/mach-mstar/Kconfig +++ b/arch/arm/mach-mstar/Kconfig @@ -3,6 +3,7 @@ menuconfig ARCH_MSTARV7 depends on ARCH_MULTI_V7 select ARM_GIC select ARM_HEAVY_MB + select HAVE_ARM_ARCH_TIMER select MST_IRQ select MSTAR_MSC313_MPLL help -- cgit From a12f76345e026f1b300a0d17c56f020b6949b093 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 21 Feb 2022 15:55:12 +0100 Subject: cfg80211: fix CONFIG_CFG80211_EXTRA_REGDB_KEYDIR typo The kbuild change here accidentally removed not only the unquoting, but also the last character of the variable name. Fix that. Fixes: 129ab0d2d9f3 ("kbuild: do not quote string values in include/config/auto.conf") Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20220221155512.1d25895f7c5f.I50fa3d4189fcab90a2896fe8cae215035dae9508@changeid Signed-off-by: Johannes Berg --- net/wireless/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 1e9be50469ce..527ae669f6f7 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -33,7 +33,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ ) > $@ -$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDI) \ +$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR)/*.x509) @$(kecho) " GEN $@" $(Q)(set -e; \ -- cgit From cc71d37fd1f11e0495b1cf580909ebea37eaa886 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 25 Feb 2022 17:18:58 -0800 Subject: HID: vivaldi: fix sysfs attributes leak The driver creates the top row map sysfs attribute in input_configured() method; unfortunately we do not have a callback that is executed when HID interface is unbound, thus we are leaking these sysfs attributes, for example when device is disconnected. To fix it let's switch to managed version of adding sysfs attributes which will ensure that they are destroyed when the driver is unbound. Fixes: 14c9c014babe ("HID: add vivaldi HID driver") Signed-off-by: Dmitry Torokhov Tested-by: Stephen Boyd Reviewed-by: Stephen Boyd Signed-off-by: Jiri Kosina --- drivers/hid/hid-vivaldi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-vivaldi.c b/drivers/hid/hid-vivaldi.c index efa6140915f4..42ceb2058a09 100644 --- a/drivers/hid/hid-vivaldi.c +++ b/drivers/hid/hid-vivaldi.c @@ -144,7 +144,7 @@ out: static int vivaldi_input_configured(struct hid_device *hdev, struct hid_input *hidinput) { - return sysfs_create_group(&hdev->dev.kobj, &input_attribute_group); + return devm_device_add_group(&hdev->dev, &input_attribute_group); } static const struct hid_device_id vivaldi_table[] = { -- cgit From fe23b6bbeac40de957724b90a88d46fb336e29a9 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Thu, 24 Feb 2022 19:41:10 -0800 Subject: HID: nintendo: check the return value of alloc_workqueue() The function alloc_workqueue() in nintendo_hid_probe() can fail, but there is no check of its return value. To fix this bug, its return value should be checked with new error handling code. Fixes: c4eae84feff3e ("HID: nintendo: add rumble support") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Reviewed-by: Silvan Jegen Signed-off-by: Jiri Kosina --- drivers/hid/hid-nintendo.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c index b6a9a0f3966e..2204de889739 100644 --- a/drivers/hid/hid-nintendo.c +++ b/drivers/hid/hid-nintendo.c @@ -2128,6 +2128,10 @@ static int nintendo_hid_probe(struct hid_device *hdev, spin_lock_init(&ctlr->lock); ctlr->rumble_queue = alloc_workqueue("hid-nintendo-rumble_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, 0); + if (!ctlr->rumble_queue) { + ret = -ENOMEM; + goto err; + } INIT_WORK(&ctlr->rumble_worker, joycon_rumble_worker); ret = hid_parse(hdev); -- cgit From 5838a14832d447990827d85e90afe17e6fb9c175 Mon Sep 17 00:00:00 2001 From: Nicolas Cavallari Date: Mon, 28 Feb 2022 12:03:51 +0100 Subject: thermal: core: Fix TZ_GET_TRIP NULL pointer dereference Do not call get_trip_hyst() from thermal_genl_cmd_tz_get_trip() if the thermal zone does not define one. Fixes: 1ce50e7d408e ("thermal: core: genetlink support for events/cmd/sampling") Signed-off-by: Nicolas Cavallari Cc: 5.10+ # 5.10+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index a16dd4d5d710..73e68cce292e 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -419,11 +419,12 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p) for (i = 0; i < tz->trips; i++) { enum thermal_trip_type type; - int temp, hyst; + int temp, hyst = 0; tz->ops->get_trip_type(tz, i, &type); tz->ops->get_trip_temp(tz, i, &temp); - tz->ops->get_trip_hyst(tz, i, &hyst); + if (tz->ops->get_trip_hyst) + tz->ops->get_trip_hyst(tz, i, &hyst); if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, i) || nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, type) || -- cgit From 439a8468242b313486e69b8cc3b45ddcfa898fbf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 28 Feb 2022 10:59:12 -0800 Subject: binfmt_elf: Avoid total_mapping_size for ET_EXEC Partially revert commit 5f501d555653 ("binfmt_elf: reintroduce using MAP_FIXED_NOREPLACE"), which applied the ET_DYN "total_mapping_size" logic also to ET_EXEC. At least ia64 has ET_EXEC PT_LOAD segments that are not virtual-address contiguous (but _are_ file-offset contiguous). This would result in a giant mapping attempting to cover the entire span, including the virtual address range hole, and well beyond the size of the ELF file itself, causing the kernel to refuse to load it. For example: $ readelf -lW /usr/bin/gcc ... Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz ... ... LOAD 0x000000 0x4000000000000000 0x4000000000000000 0x00b5a0 0x00b5a0 ... LOAD 0x00b5a0 0x600000000000b5a0 0x600000000000b5a0 0x0005ac 0x000710 ... ... ^^^^^^^^ ^^^^^^^^^^^^^^^^^^ ^^^^^^^^ ^^^^^^^^ File offset range : 0x000000-0x00bb4c 0x00bb4c bytes Virtual address range : 0x4000000000000000-0x600000000000bcb0 0x200000000000bcb0 bytes Remove the total_mapping_size logic for ET_EXEC, which reduces the ET_EXEC MAP_FIXED_NOREPLACE coverage to only the first PT_LOAD (better than nothing), and retains it for ET_DYN. Ironically, this is the reverse of the problem that originally caused problems with MAP_FIXED_NOREPLACE: overlapping PT_LOAD segments. Future work could restore full coverage if load_elf_binary() were to perform mappings in a separate phase from the loading (where it could resolve both overlaps and holes). Cc: Eric Biederman Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Reported-by: matoro Fixes: 5f501d555653 ("binfmt_elf: reintroduce using MAP_FIXED_NOREPLACE") Link: https://lore.kernel.org/r/a3edd529-c42d-3b09-135c-7e98a15b150f@leemhuis.info Tested-by: matoro Link: https://lore.kernel.org/lkml/ce8af9c13bcea9230c7689f3c1e0e2cd@matoro.tk Tested-By: John Paul Adrian Glaubitz Link: https://lore.kernel.org/lkml/49182d0d-708b-4029-da5f-bc18603440a6@physik.fu-berlin.de Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 605017eb9349..c36b1ec357fb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1135,14 +1135,25 @@ out_free_interp: * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); - } - /* - * Calculate the entire size of the ELF mapping (total_size). - * (Note that load_addr_set is set to true later once the - * initial mapping is performed.) - */ - if (!load_addr_set) { + /* + * Calculate the entire size of the ELF mapping + * (total_size), used for the initial mapping, + * due to load_addr_set which is set to true later + * once the initial mapping is performed. + * + * Note that this is only sensible when the LOAD + * segments are contiguous (or overlapping). If + * used for LOADs that are far apart, this would + * cause the holes between LOADs to be mapped, + * running the risk of having the mapping fail, + * as it would be larger than the ELF file itself. + * + * As a result, only ET_DYN does this, since + * some ET_EXEC (e.g. ia64) may have large virtual + * memory holes between LOADs. + * + */ total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { -- cgit From 62929726ef0ec72cbbe9440c5d125d4278b99894 Mon Sep 17 00:00:00 2001 From: Manasi Navare Date: Thu, 24 Feb 2022 17:30:54 -0800 Subject: drm/vrr: Set VRR capable prop only if it is attached to connector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VRR capable property is not attached by default to the connector It is attached only if VRR is supported. So if the driver tries to call drm core set prop function without it being attached that causes NULL dereference. Cc: Jani Nikula Cc: Ville Syrjälä Cc: dri-devel@lists.freedesktop.org Signed-off-by: Manasi Navare Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220225013055.9282-1-manasi.d.navare@intel.com --- drivers/gpu/drm/drm_connector.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index a50c82bc2b2f..76a8c707c34b 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -2330,6 +2330,9 @@ EXPORT_SYMBOL(drm_connector_atomic_hdr_metadata_equal); void drm_connector_set_vrr_capable_property( struct drm_connector *connector, bool capable) { + if (!connector->vrr_capable_property) + return; + drm_object_property_set_value(&connector->base, connector->vrr_capable_property, capable); -- cgit From 6b4b54c7ca347bcb4aa7a3cc01aa16e84ac7fbe4 Mon Sep 17 00:00:00 2001 From: Alexander Egorenkov Date: Wed, 9 Feb 2022 11:25:09 +0100 Subject: s390/setup: preserve memory at OLDMEM_BASE and OLDMEM_SIZE We need to preserve the values at OLDMEM_BASE and OLDMEM_SIZE which are used by zgetdump in case when kdump crashes. In that case zgetdump will attempt to read OLDMEM_BASE and OLDMEM_SIZE in order to find out where the memory range [0 - OLDMEM_SIZE] belonging to the production kernel is. Fixes: f1a546947431 ("s390/setup: don't reserve memory that occupied decompressor's head") Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Alexander Egorenkov Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f2c25d113e7b..05327be3a982 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -800,6 +800,8 @@ static void __init check_initrd(void) static void __init reserve_kernel(void) { memblock_reserve(0, STARTUP_NORMAL_OFFSET); + memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); + memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); memblock_reserve(__amode31_base, __eamode31 - __samode31); memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); memblock_reserve(__pa(_stext), _end - _stext); -- cgit From 9fa881f7e3c74ce6626d166bca9397e5d925937f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 23 Feb 2022 13:02:59 +0100 Subject: s390/ftrace: fix ftrace_caller/ftrace_regs_caller generation ftrace_caller was used for both ftrace_caller and ftrace_regs_caller, which means that the target address of the hotpatch trampoline was never updated. With commit 894979689d3a ("s390/ftrace: provide separate ftrace_caller/ftrace_regs_caller implementations") a separate ftrace_regs_caller entry point was implemeted, however it was forgotten to implement the necessary changes for ftrace_modify_call and ftrace_make_call, where the branch target has to be modified accordingly. Therefore add the missing code now. Fixes: 894979689d3a ("s390/ftrace: provide separate ftrace_caller/ftrace_regs_caller implementations") Reviewed-by: Sven Schnelle Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ftrace.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 21d62d8b6b9a..5026d218066d 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -159,9 +159,38 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return 0; } +static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec) +{ + struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_insn insn; + s64 disp; + u16 opc; + + if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn))) + return ERR_PTR(-EFAULT); + disp = (s64)insn.disp * 2; + trampoline = (void *)(rec->ip + disp); + if (get_kernel_nofault(opc, &trampoline->brasl_opc)) + return ERR_PTR(-EFAULT); + if (opc != 0xc015) + return ERR_PTR(-EINVAL); + return trampoline; +} + int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + u64 old; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + if (get_kernel_nofault(old, &trampoline->interceptor)) + return -EFAULT; + if (old != old_addr) + return -EINVAL; + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); return 0; } @@ -188,6 +217,12 @@ static void brcl_enable(void *brcl) int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); brcl_enable((void *)rec->ip); return 0; } -- cgit From 1389f17937a03fe4ec71b094e1aa6530a901963e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 22 Feb 2022 14:53:47 +0100 Subject: s390/ftrace: fix arch_ftrace_get_regs implementation arch_ftrace_get_regs is supposed to return a struct pt_regs pointer only if the pt_regs structure contains all register contents, which means it must have been populated when created via ftrace_regs_caller. If it was populated via ftrace_caller the contents are not complete (the psw mask part is missing), and therefore a NULL pointer needs be returned. The current code incorrectly always returns a struct pt_regs pointer. Fix this by adding another pt_regs flag which indicates if the contents are complete, and fix arch_ftrace_get_regs accordingly. Fixes: 894979689d3a ("s390/ftrace: provide separate ftrace_caller/ftrace_regs_caller implementations") Reported-by: Christophe Leroy Reported-by: Naveen N. Rao Reviewed-by: Sven Schnelle Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ftrace.h | 10 ++++++---- arch/s390/include/asm/ptrace.h | 2 ++ arch/s390/kernel/ftrace.c | 2 +- arch/s390/kernel/mcount.S | 9 +++++++++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 267f70f4393f..6f80ec9c04be 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -47,15 +47,17 @@ struct ftrace_regs { static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { - return &fregs->regs; + struct pt_regs *regs = &fregs->regs; + + if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS)) + return regs; + return NULL; } static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs, unsigned long ip) { - struct pt_regs *regs = arch_ftrace_get_regs(fregs); - - regs->psw.addr = ip; + fregs->regs.psw.addr = ip; } /* diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 4ffa8e7f0ed3..ddb70fb13fbc 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -15,11 +15,13 @@ #define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ #define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ #define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ #define _PIF_SYSCALL BIT(PIF_SYSCALL) #define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) #define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) #define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) #ifndef __ASSEMBLY__ diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 5026d218066d..89c0870d5679 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -326,7 +326,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, regs = ftrace_get_regs(fregs); p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) + if (!regs || unlikely(!p) || kprobe_disabled(p)) goto out; if (kprobe_running()) { diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 39bcc0e39a10..a24177dcd12a 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -27,6 +27,7 @@ ENDPROC(ftrace_stub) #define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) #define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) #define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) +#define STACK_PTREGS_FLAGS (STACK_PTREGS + __PT_FLAGS) #ifdef __PACK_STACK /* allocate just enough for r14, r15 and backchain */ #define TRACED_FUNC_FRAME_SIZE 24 @@ -57,6 +58,14 @@ ENDPROC(ftrace_stub) .if \allregs == 1 stg %r14,(STACK_PTREGS_PSW)(%r15) stosm (STACK_PTREGS_PSW)(%r15),0 +#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + mvghi STACK_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS +#else + lghi %r14,_PIF_FTRACE_FULL_REGS + stg %r14,STACK_PTREGS_FLAGS(%r15) +#endif + .else + xc STACK_PTREGS_FLAGS(8,%r15),STACK_PTREGS_FLAGS(%r15) .endif lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address -- cgit From c194dad21025dfd043210912653baab823bdff67 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 24 Feb 2022 22:03:29 +0100 Subject: s390/extable: fix exception table sorting s390 has a swap_ex_entry_fixup function, however it is not being used since common code expects a swap_ex_entry_fixup define. If it is not defined the default implementation will be used. So fix this by adding a proper define. However also the implementation of the function must be fixed, since a NULL value for handler has a special meaning and must not be adjusted. Luckily all of this doesn't fix a real bug currently: the main extable is correctly sorted during build time, and for runtime sorting there is currently no case where the handler field is not NULL. Fixes: 05a68e892e89 ("s390/kernel: expand exception table logic to allow new handling options") Acked-by: Ilya Leoshkevich Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/extable.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index 16dc57dd90b3..8511f0e59290 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -69,8 +69,13 @@ static inline void swap_ex_entry_fixup(struct exception_table_entry *a, { a->fixup = b->fixup + delta; b->fixup = tmp.fixup - delta; - a->handler = b->handler + delta; - b->handler = tmp.handler - delta; + a->handler = b->handler; + if (a->handler) + a->handler += delta; + b->handler = tmp.handler; + if (b->handler) + b->handler -= delta; } +#define swap_ex_entry_fixup swap_ex_entry_fixup #endif -- cgit From db6140e5e35a48405e669353bd54042c1d4c3841 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Mon, 28 Feb 2022 11:23:49 +0200 Subject: net/sched: act_ct: Fix flow table lookup failure with no originating ifindex After cited commit optimizted hw insertion, flow table entries are populated with ifindex information which was intended to only be used for HW offload. This tuple ifindex is hashed in the flow table key, so it must be filled for lookup to be successful. But tuple ifindex is only relevant for the netfilter flowtables (nft), so it's not filled in act_ct flow table lookup, resulting in lookup failure, and no SW offload and no offload teardown for TCP connection FIN/RST packets. To fix this, add new tc ifindex field to tuple, which will only be used for offloading, not for lookup, as it will not be part of the tuple hash. Fixes: 9795ded7f924 ("net/sched: act_ct: Fill offloading tuple iifidx") Signed-off-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 +++++- net/netfilter/nf_flow_table_offload.c | 6 +++++- net/sched/act_ct.c | 13 +++++++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a3647fadf1cc..bd59e950f4d6 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -96,6 +96,7 @@ enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_NEIGH, FLOW_OFFLOAD_XMIT_XFRM, FLOW_OFFLOAD_XMIT_DIRECT, + FLOW_OFFLOAD_XMIT_TC, }; #define NF_FLOW_TABLE_ENCAP_MAX 2 @@ -127,7 +128,7 @@ struct flow_offload_tuple { struct { } __hash; u8 dir:2, - xmit_type:2, + xmit_type:3, encap_num:2, in_vlan_ingress:2; u16 mtu; @@ -142,6 +143,9 @@ struct flow_offload_tuple { u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; + struct { + u32 iifidx; + } tc; }; }; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b561e0a44a45..fc4265acd9c4 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -110,7 +110,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match, nf_flow_rule_lwt_match(match, tun_info); } - key->meta.ingress_ifindex = tuple->iifidx; + if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC) + key->meta.ingress_ifindex = tuple->tc.iifidx; + else + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 33e70d60f0bf..ec19f625863a 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -361,6 +361,13 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) } } +static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, + struct nf_conn_act_ct_ext *act_ct_ext, u8 dir) +{ + entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC; + entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir]; +} + static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, bool tcp) @@ -385,10 +392,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, act_ct_ext = nf_conn_act_ct_ext_find(ct); if (act_ct_ext) { - entry->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_ORIGINAL]; - entry->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_REPLY]; + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL); + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY); } err = flow_offload_add(&ct_ft->nf_ft, entry); -- cgit From a0e897d1b36793fe0ab899f2fe93dff25c82f418 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Mon, 17 Jan 2022 19:20:06 +0100 Subject: arm64: dts: armada-3720-turris-mox: Add missing ethernet0 alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit U-Boot uses ethernet* aliases for setting MAC addresses. Therefore define also alias for ethernet0. Fixes: 7109d817db2e ("arm64: dts: marvell: add DTS for Turris Mox") Signed-off-by: Pali Rohár Signed-off-by: Gregory CLEMENT Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index 04da07ae4420..1eddf31d8bd8 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -18,6 +18,7 @@ aliases { spi0 = &spi0; + ethernet0 = ð0; ethernet1 = ð1; mmc0 = &sdhci0; mmc1 = &sdhci1; -- cgit From fc7f750dc9d102c1ed7bbe4591f991e770c99033 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 28 Feb 2022 10:43:31 +0300 Subject: staging: gdm724x: fix use after free in gdm_lte_rx() The netif_rx_ni() function frees the skb so we can't dereference it to save the skb->len. Fixes: 61e121047645 ("staging: gdm7240: adding LTE USB driver") Cc: stable Reported-by: kernel test robot Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220228074331.GA13685@kili Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gdm724x/gdm_lte.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/staging/gdm724x/gdm_lte.c b/drivers/staging/gdm724x/gdm_lte.c index 493ed4821515..0d8d8fed283d 100644 --- a/drivers/staging/gdm724x/gdm_lte.c +++ b/drivers/staging/gdm724x/gdm_lte.c @@ -76,14 +76,15 @@ static void tx_complete(void *arg) static int gdm_lte_rx(struct sk_buff *skb, struct nic *nic, int nic_type) { - int ret; + int ret, len; + len = skb->len + ETH_HLEN; ret = netif_rx_ni(skb); if (ret == NET_RX_DROP) { nic->stats.rx_dropped++; } else { nic->stats.rx_packets++; - nic->stats.rx_bytes += skb->len + ETH_HLEN; + nic->stats.rx_bytes += len; } return 0; -- cgit From 275f3f64870245b06188f24bdf917e55a813d294 Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Tue, 1 Mar 2022 14:34:57 -0800 Subject: Bluetooth: Fix not checking MGMT cmd pending queue A number of places in the MGMT handlers we examine the command queue for other commands (in progress but not yet complete) that will interact with the process being performed. However, not all commands go into the queue if one of: 1. There is no negative side effect of consecutive or redundent commands 2. The command is entirely perform "inline". This change examines each "pending command" check, and if it is not needed, deletes the check. Of the remaining pending command checks, we make sure that the command is in the pending queue by using the mgmt_pending_add/mgmt_pending_remove pair rather than the mgmt_pending_new/mgmt_pending_free pair. Link: https://lore.kernel.org/linux-bluetooth/f648f2e11bb3c2974c32e605a85ac3a9fac944f1.camel@redhat.com/T/ Tested-by: Maxim Levitsky Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 99 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 37087cf7dc5a..533cf60673a3 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1218,7 +1218,13 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode *cp; + + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + + cp = cmd->param; bt_dev_dbg(hdev, "err %d", err); @@ -1242,7 +1248,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) @@ -1281,7 +1287,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_POWERED, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1290,6 +1296,9 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd, mgmt_set_powered_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1383,6 +1392,10 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1402,7 +1415,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_dev_unlock(hdev); } @@ -1511,7 +1524,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1538,6 +1551,9 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd, mgmt_set_discoverable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1550,6 +1566,10 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1562,7 +1582,9 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + if (cmd) + mgmt_pending_remove(cmd); + hci_dev_unlock(hdev); } @@ -1634,7 +1656,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1654,6 +1676,9 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd, mgmt_set_connectable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1774,6 +1799,10 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) u8 enable = cp->val; bool changed; + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + return; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -3321,6 +3350,9 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); + if (cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + return; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3493,6 +3525,9 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); + if (cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -3759,13 +3794,6 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (pending_find(MGMT_OP_SET_WIDEBAND_SPEECH, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_WIDEBAND_SPEECH, - MGMT_STATUS_BUSY); - goto unlock; - } - if (hdev_is_powered(hdev) && !!cp->val != hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) { @@ -5036,12 +5064,6 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_BUSY); - goto unlock; - } - cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0); if (!cmd) err = -ENOMEM; @@ -5261,11 +5283,16 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5327,7 +5354,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, else hdev->discovery.limited = false; - cmd = mgmt_pending_new(sk, op, hdev, data, len); + cmd = mgmt_pending_add(sk, op, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -5336,7 +5363,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5430,7 +5457,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_START_SERVICE_DISCOVERY, + cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; @@ -5463,7 +5490,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5495,11 +5522,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -5535,7 +5565,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - cmd = mgmt_pending_new(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; goto unlock; @@ -5544,7 +5574,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd, stop_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto unlock; } @@ -7474,6 +7504,9 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; + if (cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -7969,11 +8002,7 @@ static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags) static bool adv_busy(struct hci_dev *hdev) { - return (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev)); + return pending_find(MGMT_OP_SET_LE, hdev); } static void add_adv_complete(struct hci_dev *hdev, struct sock *sk, u8 instance, @@ -8563,9 +8592,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev)) { + if (pending_find(MGMT_OP_SET_LE, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_BUSY); goto unlock; -- cgit From 0b0e2ff10356e7e2ffd66ecdd6eee69a2f03449b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 28 Feb 2022 16:17:15 +0200 Subject: net: dsa: restore error path of dsa_tree_change_tag_proto When the DSA_NOTIFIER_TAG_PROTO returns an error, the user space process which initiated the protocol change exits the kernel processing while still holding the rtnl_mutex. So any other process attempting to lock the rtnl_mutex would deadlock after such event. The error handling of DSA_NOTIFIER_TAG_PROTO was inadvertently changed by the blamed commit, introducing this regression. We must still call rtnl_unlock(), and we must still call DSA_NOTIFIER_TAG_PROTO for the old protocol. The latter is due to the limiting design of notifier chains for cross-chip operations, which don't have a built-in error recovery mechanism - we should look into using notifier_call_chain_robust for that. Fixes: dc452a471dba ("net: dsa: introduce tagger-owned storage for private and shared data") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220228141715.146485-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dcad3100b164..209496996cdf 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1261,7 +1261,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, info.tag_ops = tag_ops; err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); if (err) - return err; + goto out_unlock; err = dsa_tree_bind_tag_proto(dst, tag_ops); if (err) -- cgit From 1d1898f65616c4601208963c3376c1d828cbf2c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 1 Mar 2022 22:29:04 -0500 Subject: tracing/histogram: Fix sorting on old "cpu" value When trying to add a histogram against an event with the "cpu" field, it was impossible due to "cpu" being a keyword to key off of the running CPU. So to fix this, it was changed to "common_cpu" to match the other generic fields (like "common_pid"). But since some scripts used "cpu" for keying off of the CPU (for events that did not have "cpu" as a field, which is most of them), a backward compatibility trick was added such that if "cpu" was used as a key, and the event did not have "cpu" as a field name, then it would fallback and switch over to "common_cpu". This fix has a couple of subtle bugs. One was that when switching over to "common_cpu", it did not change the field name, it just set a flag. But the code still found a "cpu" field. The "cpu" field is used for filtering and is returned when the event does not have a "cpu" field. This was found by: # cd /sys/kernel/tracing # echo hist:key=cpu,pid:sort=cpu > events/sched/sched_wakeup/trigger # cat events/sched/sched_wakeup/hist Which showed the histogram unsorted: { cpu: 19, pid: 1175 } hitcount: 1 { cpu: 6, pid: 239 } hitcount: 2 { cpu: 23, pid: 1186 } hitcount: 14 { cpu: 12, pid: 249 } hitcount: 2 { cpu: 3, pid: 994 } hitcount: 5 Instead of hard coding the "cpu" checks, take advantage of the fact that trace_event_field_field() returns a special field for "cpu" and "CPU" if the event does not have "cpu" as a field. This special field has the "filter_type" of "FILTER_CPU". Check that to test if the returned field is of the CPU type instead of doing the string compare. Also, fix the sorting bug by testing for the hist_field flag of HIST_FIELD_FL_CPU when setting up the sort routine. Otherwise it will use the special CPU field to know what compare routine to use, and since that special field does not have a size, it returns tracing_map_cmp_none. Cc: stable@vger.kernel.org Fixes: 1e3bac71c505 ("tracing/histogram: Rename "cpu" to "common_cpu"") Reported-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ada87bfb5bb8..dc7f733b4cb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2289,9 +2289,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, /* * For backward compatibility, if field_name * was "cpu", then we treat this the same as - * common_cpu. + * common_cpu. This also works for "CPU". */ - if (strcmp(field_name, "cpu") == 0) { + if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, @@ -4832,7 +4832,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; - else if (!field) + else if (!field || hist_field->flags & HIST_FIELD_FL_CPU) cmp_fn = tracing_map_cmp_num(hist_field->size, hist_field->is_signed); else if (is_string_field(field)) -- cgit From 81a36d8ce554b82b0a08e2b95d0bd44fcbff339b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 23:39:38 -0800 Subject: Input: elan_i2c - move regulator_[en|dis]able() out of elan_[en|dis]able_power() elan_disable_power() is called conditionally on suspend, where as elan_enable_power() is always called on resume. This leads to an imbalance in the regulator's enable count. Move the regulator_[en|dis]able() calls out of elan_[en|dis]able_power() in preparation of fixing this. No functional changes intended. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220131135436.29638-1-hdegoede@redhat.com [dtor: consolidate elan_[en|dis]able() into elan_set_power()] Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_core.c | 62 +++++++++++++------------------------ 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 47af62c12267..185191b86fad 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -186,55 +186,21 @@ static int elan_get_fwinfo(u16 ic_type, u8 iap_version, u16 *validpage_count, return 0; } -static int elan_enable_power(struct elan_tp_data *data) +static int elan_set_power(struct elan_tp_data *data, bool on) { int repeat = ETP_RETRY_COUNT; int error; - error = regulator_enable(data->vcc); - if (error) { - dev_err(&data->client->dev, - "failed to enable regulator: %d\n", error); - return error; - } - do { - error = data->ops->power_control(data->client, true); + error = data->ops->power_control(data->client, on); if (error >= 0) return 0; msleep(30); } while (--repeat > 0); - dev_err(&data->client->dev, "failed to enable power: %d\n", error); - return error; -} - -static int elan_disable_power(struct elan_tp_data *data) -{ - int repeat = ETP_RETRY_COUNT; - int error; - - do { - error = data->ops->power_control(data->client, false); - if (!error) { - error = regulator_disable(data->vcc); - if (error) { - dev_err(&data->client->dev, - "failed to disable regulator: %d\n", - error); - /* Attempt to power the chip back up */ - data->ops->power_control(data->client, true); - break; - } - - return 0; - } - - msleep(30); - } while (--repeat > 0); - - dev_err(&data->client->dev, "failed to disable power: %d\n", error); + dev_err(&data->client->dev, "failed to set power %s: %d\n", + on ? "on" : "off", error); return error; } @@ -1399,9 +1365,19 @@ static int __maybe_unused elan_suspend(struct device *dev) /* Enable wake from IRQ */ data->irq_wake = (enable_irq_wake(client->irq) == 0); } else { - ret = elan_disable_power(data); + ret = elan_set_power(data, false); + if (ret) + goto err; + + ret = regulator_disable(data->vcc); + if (ret) { + dev_err(dev, "error %d disabling regulator\n", ret); + /* Attempt to power the chip back up */ + elan_set_power(data, true); + } } +err: mutex_unlock(&data->sysfs_mutex); return ret; } @@ -1417,7 +1393,13 @@ static int __maybe_unused elan_resume(struct device *dev) data->irq_wake = false; } - error = elan_enable_power(data); + error = regulator_enable(data->vcc); + if (error) { + dev_err(dev, "error %d enabling regulator\n", error); + goto err; + } + + error = elan_set_power(data, true); if (error) { dev_err(dev, "power up when resuming failed: %d\n", error); goto err; -- cgit From 04b7762e37c95d9b965d16bb0e18dbd1fa2e2861 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 23:39:50 -0800 Subject: Input: elan_i2c - fix regulator enable count imbalance after suspend/resume Before these changes elan_suspend() would only disable the regulator when device_may_wakeup() returns false; whereas elan_resume() would unconditionally enable it, leading to an enable count imbalance when device_may_wakeup() returns true. This triggers the "WARN_ON(regulator->enable_count)" in regulator_put() when the elan_i2c driver gets unbound, this happens e.g. with the hot-plugable dock with Elan I2C touchpad for the Asus TF103C 2-in-1. Fix this by making the regulator_enable() call also be conditional on device_may_wakeup() returning false. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220131135436.29638-2-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 185191b86fad..e1758d5ffe42 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1388,17 +1388,17 @@ static int __maybe_unused elan_resume(struct device *dev) struct elan_tp_data *data = i2c_get_clientdata(client); int error; - if (device_may_wakeup(dev) && data->irq_wake) { + if (!device_may_wakeup(dev)) { + error = regulator_enable(data->vcc); + if (error) { + dev_err(dev, "error %d enabling regulator\n", error); + goto err; + } + } else if (data->irq_wake) { disable_irq_wake(client->irq); data->irq_wake = false; } - error = regulator_enable(data->vcc); - if (error) { - dev_err(dev, "error %d enabling regulator\n", error); - goto err; - } - error = elan_set_power(data, true); if (error) { dev_err(dev, "power up when resuming failed: %d\n", error); -- cgit From 690bb6fb64f5dc7437317153902573ecad67593d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 28 Feb 2022 00:01:24 +0100 Subject: batman-adv: Request iflink once in batadv-on-batadv check There is no need to call dev_get_iflink multiple times for the same net_device in batadv_is_on_batman_iface. And since some of the .ndo_get_iflink callbacks are dynamic (for example via RCUs like in vxcan_get_iflink), it could easily happen that the returned values are not stable. The pre-checks before __dev_get_by_index are then of course bogus. Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 8a2b78f9c4b2..35aa1122043b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -149,22 +149,23 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; + int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_softif_is_valid(net_dev)) return true; + iflink = dev_get_iflink(net_dev); + /* no more parents..stop recursion */ - if (dev_get_iflink(net_dev) == 0 || - dev_get_iflink(net_dev) == net_dev->ifindex) + if (iflink == 0 || iflink == net_dev->ifindex) return false; parent_net = batadv_getlink_net(net_dev, net); /* recurse over the parent device */ - parent_dev = __dev_get_by_index((struct net *)parent_net, - dev_get_iflink(net_dev)); + parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); /* if we got a NULL parent_dev there is something broken.. */ if (!parent_dev) { pr_err("Cannot find parent device\n"); -- cgit From 6116ba09423f7d140f0460be6a1644dceaad00da Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 28 Feb 2022 00:01:24 +0100 Subject: batman-adv: Request iflink once in batadv_get_real_netdevice There is no need to call dev_get_iflink multiple times for the same net_device in batadv_get_real_netdevice. And since some of the ndo_get_iflink callbacks are dynamic (for example via RCUs like in vxcan_get_iflink), it could easily happen that the returned values are not stable. The pre-checks before __dev_get_by_index are then of course bogus. Fixes: 5ed4a460a1d3 ("batman-adv: additional checks for virtual interfaces on top of WiFi") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 35aa1122043b..e2760cfce190 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -215,14 +215,16 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; - int ifindex; + int iflink; ASSERT_RTNL(); if (!netdev) return NULL; - if (netdev->ifindex == dev_get_iflink(netdev)) { + iflink = dev_get_iflink(netdev); + + if (netdev->ifindex == iflink) { dev_hold(netdev); return netdev; } @@ -232,9 +234,8 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) goto out; net = dev_net(hard_iface->soft_iface); - ifindex = dev_get_iflink(netdev); real_net = batadv_getlink_net(netdev, net); - real_netdev = dev_get_by_index(real_net, ifindex); + real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); -- cgit From 6c1f41afc1dbe59d9d3c8bb0d80b749c119aa334 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 27 Feb 2022 23:23:49 +0100 Subject: batman-adv: Don't expect inter-netns unique iflink indices The ifindex doesn't have to be unique for multiple network namespaces on the same machine. $ ip netns add test1 $ ip -net test1 link add dummy1 type dummy $ ip netns add test2 $ ip -net test2 link add dummy2 type dummy $ ip -net test1 link show dev dummy1 6: dummy1: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 96:81:55:1e:dd:85 brd ff:ff:ff:ff:ff:ff $ ip -net test2 link show dev dummy2 6: dummy2: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 5a:3c:af:35:07:c3 brd ff:ff:ff:ff:ff:ff But the batman-adv code to walk through the various layers of virtual interfaces uses this assumption because dev_get_iflink handles it internally and doesn't return the actual netns of the iflink. And dev_get_iflink only documents the situation where ifindex == iflink for physical devices. But only checking for dev->netdev_ops->ndo_get_iflink is also not an option because ipoib_get_iflink implements it even when it sometimes returns an iflink != ifindex and sometimes iflink == ifindex. The caller must therefore make sure itself to check both netns and iflink + ifindex for equality. Only when they are equal, a "physical" interface was detected which should stop the traversal. On the other hand, vxcan_get_iflink can also return 0 in case there was currently no valid peer. In this case, it is still necessary to stop. Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Fixes: 5ed4a460a1d3 ("batman-adv: additional checks for virtual interfaces on top of WiFi") Reported-by: Sabrina Dubroca Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e2760cfce190..35fadb924849 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -157,13 +157,15 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return true; iflink = dev_get_iflink(net_dev); - - /* no more parents..stop recursion */ - if (iflink == 0 || iflink == net_dev->ifindex) + if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); + /* iflink to itself, most likely physical device */ + if (net == parent_net && iflink == net_dev->ifindex) + return false; + /* recurse over the parent device */ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); /* if we got a NULL parent_dev there is something broken.. */ @@ -223,8 +225,7 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) return NULL; iflink = dev_get_iflink(netdev); - - if (netdev->ifindex == iflink) { + if (iflink == 0) { dev_hold(netdev); return netdev; } @@ -235,6 +236,14 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) net = dev_net(hard_iface->soft_iface); real_net = batadv_getlink_net(netdev, net); + + /* iflink to itself, most likely physical device */ + if (net == real_net && netdev->ifindex == iflink) { + real_netdev = netdev; + dev_hold(real_netdev); + goto out; + } + real_netdev = dev_get_by_index(real_net, iflink); out: -- cgit From 0aa6b294b312d9710804679abd2c0c8ca52cc2bc Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Wed, 2 Mar 2022 15:42:41 +0800 Subject: ALSA: intel_hdmi: Fix reference to PCM buffer address PCM buffers might be allocated dynamically when the buffer preallocation failed or a larger buffer is requested, and it's not guaranteed that substream->dma_buffer points to the actually used buffer. The driver needs to refer to substream->runtime->dma_addr instead for the buffer address. Signed-off-by: Zhen Ni Cc: Link: https://lore.kernel.org/r/20220302074241.30469-1-nizhen@uniontech.com Signed-off-by: Takashi Iwai --- sound/x86/intel_hdmi_audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/x86/intel_hdmi_audio.c b/sound/x86/intel_hdmi_audio.c index 1c94eaff1931..4a3ff6468aa7 100644 --- a/sound/x86/intel_hdmi_audio.c +++ b/sound/x86/intel_hdmi_audio.c @@ -1261,7 +1261,7 @@ static int had_pcm_mmap(struct snd_pcm_substream *substream, { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, vma->vm_start, - substream->dma_buffer.addr >> PAGE_SHIFT, + substream->runtime->dma_addr >> PAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot); } -- cgit From 22ba5e99b96f1c0dbdfa4f4e1d9751b4c8348541 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 22 Feb 2022 11:31:18 +0800 Subject: erofs: fix ztailpacking on > 4GiB filesystems z_idataoff here is an absolute physical offset, so it should use erofs_off_t (64 bits at least). Otherwise, it'll get trimmed and cause the decompresion failure. Link: https://lore.kernel.org/r/20220222033118.20540-1-hsiangkao@linux.alibaba.com Fixes: ab92184ff8f1 ("erofs: add on-disk compressed tail-packing inline support") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b8272fb95fd6..5aa2cf2c2f80 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -325,7 +325,7 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long z_tailextent_headlcn; - unsigned int z_idataoff; + erofs_off_t z_idataoff; unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ -- cgit From 8f4347081be32e67b0873827e0138ab0fdaaf450 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 2 Mar 2022 11:16:36 +0100 Subject: staging: rtl8723bs: Fix access-point mode deadlock Commit 54659ca026e5 ("staging: rtl8723bs: remove possible deadlock when disconnect (v2)") split the locking of pxmitpriv->lock vs sleep_q/lock into 2 locks in attempt to fix a lockdep reported issue with the locking order of the sta_hash_lock vs pxmitpriv->lock. But in the end this turned out to not fully solve the sta_hash_lock issue so commit a7ac783c338b ("staging: rtl8723bs: remove a second possible deadlock") was added to fix this in another way. The original fix was kept as it was still seen as a good thing to have, but now it turns out that it creates a deadlock in access-point mode: [Feb20 23:47] ====================================================== [ +0.074085] WARNING: possible circular locking dependency detected [ +0.074077] 5.16.0-1-amd64 #1 Tainted: G C E [ +0.064710] ------------------------------------------------------ [ +0.074075] ksoftirqd/3/29 is trying to acquire lock: [ +0.060542] ffffb8b30062ab00 (&pxmitpriv->lock){+.-.}-{2:2}, at: rtw_xmit_classifier+0x8a/0x140 [r8723bs] [ +0.114921] but task is already holding lock: [ +0.069908] ffffb8b3007ab704 (&psta->sleep_q.lock){+.-.}-{2:2}, at: wakeup_sta_to_xmit+0x3b/0x300 [r8723bs] [ +0.116976] which lock already depends on the new lock. [ +0.098037] the existing dependency chain (in reverse order) is: [ +0.089704] -> #1 (&psta->sleep_q.lock){+.-.}-{2:2}: [ +0.077232] _raw_spin_lock_bh+0x34/0x40 [ +0.053261] xmitframe_enqueue_for_sleeping_sta+0xc1/0x2f0 [r8723bs] [ +0.082572] rtw_xmit+0x58b/0x940 [r8723bs] [ +0.056528] _rtw_xmit_entry+0xba/0x350 [r8723bs] [ +0.062755] dev_hard_start_xmit+0xf1/0x320 [ +0.056381] sch_direct_xmit+0x9e/0x360 [ +0.052212] __dev_queue_xmit+0xce4/0x1080 [ +0.055334] ip6_finish_output2+0x18f/0x6e0 [ +0.056378] ndisc_send_skb+0x2c8/0x870 [ +0.052209] ndisc_send_ns+0xd3/0x210 [ +0.050130] addrconf_dad_work+0x3df/0x5a0 [ +0.055338] process_one_work+0x274/0x5a0 [ +0.054296] worker_thread+0x52/0x3b0 [ +0.050124] kthread+0x16c/0x1a0 [ +0.044925] ret_from_fork+0x1f/0x30 [ +0.049092] -> #0 (&pxmitpriv->lock){+.-.}-{2:2}: [ +0.074101] __lock_acquire+0x10f5/0x1d80 [ +0.054298] lock_acquire+0xd7/0x300 [ +0.049088] _raw_spin_lock_bh+0x34/0x40 [ +0.053248] rtw_xmit_classifier+0x8a/0x140 [r8723bs] [ +0.066949] rtw_xmitframe_enqueue+0xa/0x20 [r8723bs] [ +0.066946] rtl8723bs_hal_xmitframe_enqueue+0x14/0x50 [r8723bs] [ +0.078386] wakeup_sta_to_xmit+0xa6/0x300 [r8723bs] [ +0.065903] rtw_recv_entry+0xe36/0x1160 [r8723bs] [ +0.063809] rtl8723bs_recv_tasklet+0x349/0x6c0 [r8723bs] [ +0.071093] tasklet_action_common.constprop.0+0xe5/0x110 [ +0.070966] __do_softirq+0x16f/0x50a [ +0.050134] __irq_exit_rcu+0xeb/0x140 [ +0.051172] irq_exit_rcu+0xa/0x20 [ +0.047006] common_interrupt+0xb8/0xd0 [ +0.052214] asm_common_interrupt+0x1e/0x40 [ +0.056381] finish_task_switch.isra.0+0x100/0x3a0 [ +0.063670] __schedule+0x3ad/0xd20 [ +0.048047] schedule+0x4e/0xc0 [ +0.043880] smpboot_thread_fn+0xc4/0x220 [ +0.054298] kthread+0x16c/0x1a0 [ +0.044922] ret_from_fork+0x1f/0x30 [ +0.049088] other info that might help us debug this: [ +0.095950] Possible unsafe locking scenario: [ +0.070952] CPU0 CPU1 [ +0.054282] ---- ---- [ +0.054285] lock(&psta->sleep_q.lock); [ +0.047004] lock(&pxmitpriv->lock); [ +0.074082] lock(&psta->sleep_q.lock); [ +0.077209] lock(&pxmitpriv->lock); [ +0.043873] *** DEADLOCK *** [ +0.070950] 1 lock held by ksoftirqd/3/29: [ +0.049082] #0: ffffb8b3007ab704 (&psta->sleep_q.lock){+.-.}-{2:2}, at: wakeup_sta_to_xmit+0x3b/0x300 [r8723bs] Analysis shows that in hindsight the splitting of the lock was not a good idea, so revert this to fix the access-point mode deadlock. Note this is a straight-forward revert done with git revert, the commented out "/* spin_lock_bh(&psta_bmc->sleep_q.lock); */" lines were part of the code before the reverted changes. Fixes: 54659ca026e5 ("staging: rtl8723bs: remove possible deadlock when disconnect (v2)") Cc: stable Cc: Fabio Aiuto Signed-off-by: Hans de Goede BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215542 Link: https://lore.kernel.org/r/20220302101637.26542-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/core/rtw_mlme_ext.c | 7 +++++-- drivers/staging/rtl8723bs/core/rtw_recv.c | 10 +++++++--- drivers/staging/rtl8723bs/core/rtw_sta_mgt.c | 22 ++++++++++------------ drivers/staging/rtl8723bs/core/rtw_xmit.c | 16 +++++++++------- drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c | 2 ++ 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c b/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c index 0f82f5031c43..49a3f45cb771 100644 --- a/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c +++ b/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c @@ -5907,6 +5907,7 @@ u8 chk_bmc_sleepq_hdl(struct adapter *padapter, unsigned char *pbuf) struct sta_info *psta_bmc; struct list_head *xmitframe_plist, *xmitframe_phead, *tmp; struct xmit_frame *pxmitframe = NULL; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; struct sta_priv *pstapriv = &padapter->stapriv; /* for BC/MC Frames */ @@ -5917,7 +5918,8 @@ u8 chk_bmc_sleepq_hdl(struct adapter *padapter, unsigned char *pbuf) if ((pstapriv->tim_bitmap&BIT(0)) && (psta_bmc->sleepq_len > 0)) { msleep(10);/* 10ms, ATIM(HIQ) Windows */ - spin_lock_bh(&psta_bmc->sleep_q.lock); + /* spin_lock_bh(&psta_bmc->sleep_q.lock); */ + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta_bmc->sleep_q); list_for_each_safe(xmitframe_plist, tmp, xmitframe_phead) { @@ -5940,7 +5942,8 @@ u8 chk_bmc_sleepq_hdl(struct adapter *padapter, unsigned char *pbuf) rtw_hal_xmitframe_enqueue(padapter, pxmitframe); } - spin_unlock_bh(&psta_bmc->sleep_q.lock); + /* spin_unlock_bh(&psta_bmc->sleep_q.lock); */ + spin_unlock_bh(&pxmitpriv->lock); /* check hi queue and bmc_sleepq */ rtw_chk_hi_queue_cmd(padapter); diff --git a/drivers/staging/rtl8723bs/core/rtw_recv.c b/drivers/staging/rtl8723bs/core/rtw_recv.c index 41bfca549c64..105fe0e3482a 100644 --- a/drivers/staging/rtl8723bs/core/rtw_recv.c +++ b/drivers/staging/rtl8723bs/core/rtw_recv.c @@ -957,8 +957,10 @@ static signed int validate_recv_ctrl_frame(struct adapter *padapter, union recv_ if ((psta->state&WIFI_SLEEP_STATE) && (pstapriv->sta_dz_bitmap&BIT(psta->aid))) { struct list_head *xmitframe_plist, *xmitframe_phead; struct xmit_frame *pxmitframe = NULL; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; - spin_lock_bh(&psta->sleep_q.lock); + /* spin_lock_bh(&psta->sleep_q.lock); */ + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta->sleep_q); xmitframe_plist = get_next(xmitframe_phead); @@ -989,10 +991,12 @@ static signed int validate_recv_ctrl_frame(struct adapter *padapter, union recv_ update_beacon(padapter, WLAN_EID_TIM, NULL, true); } - spin_unlock_bh(&psta->sleep_q.lock); + /* spin_unlock_bh(&psta->sleep_q.lock); */ + spin_unlock_bh(&pxmitpriv->lock); } else { - spin_unlock_bh(&psta->sleep_q.lock); + /* spin_unlock_bh(&psta->sleep_q.lock); */ + spin_unlock_bh(&pxmitpriv->lock); if (pstapriv->tim_bitmap&BIT(psta->aid)) { if (psta->sleepq_len == 0) { diff --git a/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c b/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c index 0c9ea1520fd0..beb11d89db18 100644 --- a/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c +++ b/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c @@ -293,48 +293,46 @@ u32 rtw_free_stainfo(struct adapter *padapter, struct sta_info *psta) /* list_del_init(&psta->wakeup_list); */ - spin_lock_bh(&psta->sleep_q.lock); + spin_lock_bh(&pxmitpriv->lock); + rtw_free_xmitframe_queue(pxmitpriv, &psta->sleep_q); psta->sleepq_len = 0; - spin_unlock_bh(&psta->sleep_q.lock); - - spin_lock_bh(&pxmitpriv->lock); /* vo */ - spin_lock_bh(&pstaxmitpriv->vo_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->vo_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->vo_q.sta_pending); list_del_init(&(pstaxmitpriv->vo_q.tx_pending)); phwxmit = pxmitpriv->hwxmits; phwxmit->accnt -= pstaxmitpriv->vo_q.qcnt; pstaxmitpriv->vo_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->vo_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->vo_pending.lock)); */ /* vi */ - spin_lock_bh(&pstaxmitpriv->vi_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->vi_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->vi_q.sta_pending); list_del_init(&(pstaxmitpriv->vi_q.tx_pending)); phwxmit = pxmitpriv->hwxmits+1; phwxmit->accnt -= pstaxmitpriv->vi_q.qcnt; pstaxmitpriv->vi_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->vi_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->vi_pending.lock)); */ /* be */ - spin_lock_bh(&pstaxmitpriv->be_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->be_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->be_q.sta_pending); list_del_init(&(pstaxmitpriv->be_q.tx_pending)); phwxmit = pxmitpriv->hwxmits+2; phwxmit->accnt -= pstaxmitpriv->be_q.qcnt; pstaxmitpriv->be_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->be_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->be_pending.lock)); */ /* bk */ - spin_lock_bh(&pstaxmitpriv->bk_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->bk_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->bk_q.sta_pending); list_del_init(&(pstaxmitpriv->bk_q.tx_pending)); phwxmit = pxmitpriv->hwxmits+3; phwxmit->accnt -= pstaxmitpriv->bk_q.qcnt; pstaxmitpriv->bk_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->bk_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->bk_pending.lock)); */ spin_unlock_bh(&pxmitpriv->lock); diff --git a/drivers/staging/rtl8723bs/core/rtw_xmit.c b/drivers/staging/rtl8723bs/core/rtw_xmit.c index 13b8bd5ffabc..f466bfd248fb 100644 --- a/drivers/staging/rtl8723bs/core/rtw_xmit.c +++ b/drivers/staging/rtl8723bs/core/rtw_xmit.c @@ -1734,12 +1734,15 @@ void rtw_free_xmitframe_queue(struct xmit_priv *pxmitpriv, struct __queue *pfram struct list_head *plist, *phead, *tmp; struct xmit_frame *pxmitframe; + spin_lock_bh(&pframequeue->lock); + phead = get_list_head(pframequeue); list_for_each_safe(plist, tmp, phead) { pxmitframe = list_entry(plist, struct xmit_frame, list); rtw_free_xmitframe(pxmitpriv, pxmitframe); } + spin_unlock_bh(&pframequeue->lock); } s32 rtw_xmitframe_enqueue(struct adapter *padapter, struct xmit_frame *pxmitframe) @@ -1794,7 +1797,6 @@ s32 rtw_xmit_classifier(struct adapter *padapter, struct xmit_frame *pxmitframe) struct sta_info *psta; struct tx_servq *ptxservq; struct pkt_attrib *pattrib = &pxmitframe->attrib; - struct xmit_priv *xmit_priv = &padapter->xmitpriv; struct hw_xmit *phwxmits = padapter->xmitpriv.hwxmits; signed int res = _SUCCESS; @@ -1812,14 +1814,12 @@ s32 rtw_xmit_classifier(struct adapter *padapter, struct xmit_frame *pxmitframe) ptxservq = rtw_get_sta_pending(padapter, psta, pattrib->priority, (u8 *)(&ac_index)); - spin_lock_bh(&xmit_priv->lock); if (list_empty(&ptxservq->tx_pending)) list_add_tail(&ptxservq->tx_pending, get_list_head(phwxmits[ac_index].sta_queue)); list_add_tail(&pxmitframe->list, get_list_head(&ptxservq->sta_pending)); ptxservq->qcnt++; phwxmits[ac_index].accnt++; - spin_unlock_bh(&xmit_priv->lock); exit: @@ -2202,10 +2202,11 @@ void wakeup_sta_to_xmit(struct adapter *padapter, struct sta_info *psta) struct list_head *xmitframe_plist, *xmitframe_phead, *tmp; struct xmit_frame *pxmitframe = NULL; struct sta_priv *pstapriv = &padapter->stapriv; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; psta_bmc = rtw_get_bcmc_stainfo(padapter); - spin_lock_bh(&psta->sleep_q.lock); + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta->sleep_q); list_for_each_safe(xmitframe_plist, tmp, xmitframe_phead) { @@ -2306,7 +2307,7 @@ void wakeup_sta_to_xmit(struct adapter *padapter, struct sta_info *psta) _exit: - spin_unlock_bh(&psta->sleep_q.lock); + spin_unlock_bh(&pxmitpriv->lock); if (update_mask) update_beacon(padapter, WLAN_EID_TIM, NULL, true); @@ -2318,8 +2319,9 @@ void xmit_delivery_enabled_frames(struct adapter *padapter, struct sta_info *pst struct list_head *xmitframe_plist, *xmitframe_phead, *tmp; struct xmit_frame *pxmitframe = NULL; struct sta_priv *pstapriv = &padapter->stapriv; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; - spin_lock_bh(&psta->sleep_q.lock); + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta->sleep_q); list_for_each_safe(xmitframe_plist, tmp, xmitframe_phead) { @@ -2372,7 +2374,7 @@ void xmit_delivery_enabled_frames(struct adapter *padapter, struct sta_info *pst } } - spin_unlock_bh(&psta->sleep_q.lock); + spin_unlock_bh(&pxmitpriv->lock); } void enqueue_pending_xmitbuf(struct xmit_priv *pxmitpriv, struct xmit_buf *pxmitbuf) diff --git a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c index b5d5e922231c..15810438a472 100644 --- a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c +++ b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c @@ -502,7 +502,9 @@ s32 rtl8723bs_hal_xmit( rtw_issue_addbareq_cmd(padapter, pxmitframe); } + spin_lock_bh(&pxmitpriv->lock); err = rtw_xmitframe_enqueue(padapter, pxmitframe); + spin_unlock_bh(&pxmitpriv->lock); if (err != _SUCCESS) { rtw_free_xmitframe(pxmitpriv, pxmitframe); -- cgit From 342e7c6ea58200e45bcaa9bdd8402a5531c4777e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 2 Mar 2022 11:16:37 +0100 Subject: staging: rtl8723bs: Improve the comment explaining the locking rules rtw_mlme.h has a comment which briefly describes the locking rules for the rtl8723bs driver, improve this to also mention the locking order of xmit_priv.lock vs the lock(s) embedded in the various queues. Cc: Fabio Aiuto Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220302101637.26542-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/include/rtw_mlme.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/staging/rtl8723bs/include/rtw_mlme.h b/drivers/staging/rtl8723bs/include/rtw_mlme.h index c94fa7d8d5a9..1b343b434f4d 100644 --- a/drivers/staging/rtl8723bs/include/rtw_mlme.h +++ b/drivers/staging/rtl8723bs/include/rtw_mlme.h @@ -102,13 +102,17 @@ there are several "locks" in mlme_priv, since mlme_priv is a shared resource between many threads, like ISR/Call-Back functions, the OID handlers, and even timer functions. - Each struct __queue has its own locks, already. -Other items are protected by mlme_priv.lock. +Other items in mlme_priv are protected by mlme_priv.lock, while items in +xmit_priv are protected by xmit_priv.lock. To avoid possible dead lock, any thread trying to modifiying mlme_priv SHALL not lock up more than one locks at a time! +The only exception is that queue functions which take the __queue.lock +may be called with the xmit_priv.lock held. In this case the order +MUST always be first lock xmit_priv.lock and then call any queue functions +which take __queue.lock. */ -- cgit From c992fa1fd52380d0c4ced7b07479e877311ae645 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 18 Feb 2022 10:13:00 +0800 Subject: btrfs: subpage: fix a wrong check on subpage->writers [BUG] When looping btrfs/074 with 64K page size and 4K sectorsize, there is a low chance (1/50~1/100) to crash with the following ASSERT() triggered in btrfs_subpage_start_writer(): ret = atomic_add_return(nbits, &subpage->writers); ASSERT(ret == nbits); <<< This one <<< [CAUSE] With more debugging output on the parameters of btrfs_subpage_start_writer(), it shows a very concerning error: ret=29 nbits=13 start=393216 len=53248 For @nbits it's correct, but @ret which is the returned value from atomic_add_return(), it's not only larger than nbits, but also larger than max sectors per page value (for 64K page size and 4K sector size, it's 16). This indicates that some call sites are not properly decreasing the value. And that's exactly the case, in btrfs_page_unlock_writer(), due to the fact that we can have page locked either by lock_page() or process_one_page(), we have to check if the subpage has any writer. If no writers, it's locked by lock_page() and we only need to unlock it. But unfortunately the check for the writers are completely opposite: if (atomic_read(&subpage->writers)) /* No writers, locked by plain lock_page() */ return unlock_page(page); We directly unlock the page if it has writers, which is the completely opposite what we want. Thankfully the affected call site is only limited to extent_write_locked_range(), so it's mostly affecting compressed write. [FIX] Just fix the wrong check condition to fix the bug. Fixes: e55a0de18572 ("btrfs: rework page locking in __extent_writepage()") CC: stable@vger.kernel.org # 5.16 Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 29bd8c7a7706..ef7ae20d2b77 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers)) + if (atomic_read(&subpage->writers) == 0) /* No writers, locked by plain lock_page() */ return unlock_page(page); -- cgit From d99478874355d3a7b9d86dfb5d7590d5b1754b1f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 17 Feb 2022 12:12:02 +0000 Subject: btrfs: fix lost prealloc extents beyond eof after full fsync When doing a full fsync, if we have prealloc extents beyond (or at) eof, and the leaves that contain them were not modified in the current transaction, we end up not logging them. This results in losing those extents when we replay the log after a power failure, since the inode is truncated to the current value of the logged i_size. Just like for the fast fsync path, we need to always log all prealloc extents starting at or beyond i_size. The fast fsync case was fixed in commit 471d557afed155 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") but it missed the full fsync path. The problem exists since the very early days, when the log tree was added by commit e02119d5a7b439 ("Btrfs: Add a write ahead tree log to optimize synchronous operations"). Example reproducer: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt # Create our test file with many file extent items, so that they span # several leaves of metadata, even if the node/page size is 64K. Use # direct IO and not fsync/O_SYNC because it's both faster and it avoids # clearing the full sync flag from the inode - we want the fsync below # to trigger the slow full sync code path. $ xfs_io -f -d -c "pwrite -b 4K 0 16M" /mnt/foo # Now add two preallocated extents to our file without extending the # file's size. One right at i_size, and another further beyond, leaving # a gap between the two prealloc extents. $ xfs_io -c "falloc -k 16M 1M" /mnt/foo $ xfs_io -c "falloc -k 20M 1M" /mnt/foo # Make sure everything is durably persisted and the transaction is # committed. This makes all created extents to have a generation lower # than the generation of the transaction used by the next write and # fsync. sync # Now overwrite only the first extent, which will result in modifying # only the first leaf of metadata for our inode. Then fsync it. This # fsync will use the slow code path (inode full sync bit is set) because # it's the first fsync since the inode was created/loaded. $ xfs_io -c "pwrite 0 4K" -c "fsync" /mnt/foo # Extent list before power failure. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x0 3: [32768..34815]: 2172928..2174975 2048 0x800 4: [34816..40959]: hole 6144 5: [40960..43007]: 2174976..2177023 2048 0x801 # Mount fs again, trigger log replay. $ mount /dev/sdc /mnt # Extent list after power failure and log replay. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x1 # The prealloc extents at file offsets 16M and 20M are missing. So fix this by calling btrfs_log_prealloc_extents() when we are doing a full fsync, so that we always log all prealloc extents beyond eof. A test case for fstests will follow soon. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3ee014c06b82..42caf595c936 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4635,7 +4635,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they @@ -5414,6 +5414,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { + const u64 i_size = i_size_read(&inode->vfs_inode); struct btrfs_root *root = inode->root; int ins_start_slot = 0; int ins_nr = 0; @@ -5434,13 +5435,21 @@ again: if (min_key->type > max_key->type) break; - if (min_key->type == BTRFS_INODE_ITEM_KEY) + if (min_key->type == BTRFS_INODE_ITEM_KEY) { *need_log_inode_item = false; - - if ((min_key->type == BTRFS_INODE_REF_KEY || - min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. + */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; u64 other_parent = 0; @@ -5471,10 +5480,8 @@ again: btrfs_release_path(path); goto next_key; } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, @@ -5527,9 +5534,21 @@ next_key: break; } } - if (ins_nr) + if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } return ret; } -- cgit From a50e1fcbc9b85fd4e95b89a75c0884cb032a3e06 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 10:17:39 -0500 Subject: btrfs: do not WARN_ON() if we have PageError set Whenever we do any extent buffer operations we call assert_eb_page_uptodate() to complain loudly if we're operating on an non-uptodate page. Our overnight tests caught this warning earlier this week WARNING: CPU: 1 PID: 553508 at fs/btrfs/extent_io.c:6849 assert_eb_page_uptodate+0x3f/0x50 CPU: 1 PID: 553508 Comm: kworker/u4:13 Tainted: G W 5.17.0-rc3+ #564 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Workqueue: btrfs-cache btrfs_work_helper RIP: 0010:assert_eb_page_uptodate+0x3f/0x50 RSP: 0018:ffffa961440a7c68 EFLAGS: 00010246 RAX: 0017ffffc0002112 RBX: ffffe6e74453f9c0 RCX: 0000000000001000 RDX: ffffe6e74467c887 RSI: ffffe6e74453f9c0 RDI: ffff8d4c5efc2fc0 RBP: 0000000000000d56 R08: ffff8d4d4a224000 R09: 0000000000000000 R10: 00015817fa9d1ef0 R11: 000000000000000c R12: 00000000000007b1 R13: ffff8d4c5efc2fc0 R14: 0000000001500000 R15: 0000000001cb1000 FS: 0000000000000000(0000) GS:ffff8d4dbbd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff31d3448d8 CR3: 0000000118be8004 CR4: 0000000000370ee0 Call Trace: extent_buffer_test_bit+0x3f/0x70 free_space_test_bit+0xa6/0xc0 load_free_space_tree+0x1f6/0x470 caching_thread+0x454/0x630 ? rcu_read_lock_sched_held+0x12/0x60 ? rcu_read_lock_sched_held+0x12/0x60 ? rcu_read_lock_sched_held+0x12/0x60 ? lock_release+0x1f0/0x2d0 btrfs_work_helper+0xf2/0x3e0 ? lock_release+0x1f0/0x2d0 ? finish_task_switch.isra.0+0xf9/0x3a0 process_one_work+0x26d/0x580 ? process_one_work+0x580/0x580 worker_thread+0x55/0x3b0 ? process_one_work+0x580/0x580 kthread+0xf0/0x120 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 This was partially fixed by c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it"), however all that fix did was keep us from finding extent buffers after a failed writeout. It didn't keep us from continuing to use a buffer that we already had found. In this case we're searching the commit root to cache the block group, so we can start committing the transaction and switch the commit root and then start writing. After the switch we can look up an extent buffer that hasn't been written yet and start processing that block group. Then we fail to write that block out and clear Uptodate on the page, and then we start spewing these errors. Normally we're protected by the tree lock to a certain degree here. If we read a block we have that block read locked, and we block the writer from locking the block before we submit it for the write. However this isn't necessarily fool proof because the read could happen before we do the submit_bio and after we locked and unlocked the extent buffer. Also in this particular case we have path->skip_locking set, so that won't save us here. We'll simply get a block that was valid when we read it, but became invalid while we were using it. What we really want is to catch the case where we've "read" a block but it's not marked Uptodate. On read we ClearPageError(), so if we're !Uptodate and !Error we know we didn't do the right thing for reading the page. Fix this by checking !Uptodate && !Error, this way we will not complain if our buffer gets invalidated while we're using it, and we'll maintain the spirit of the check which is to make sure we have a fully in-cache block while we're messing with it. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d6d48ecf823c..9081223c3230 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6851,14 +6851,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } -- cgit From a6ab66eb8541d61b0a11d70980f07b4c2dfeddc5 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Tue, 22 Feb 2022 16:42:07 +0800 Subject: btrfs: tree-checker: use u64 for item data end to avoid overflow User reported there is an array-index-out-of-bounds access while mounting the crafted image: [350.411942 ] loop0: detected capacity change from 0 to 262144 [350.427058 ] BTRFS: device fsid a62e00e8-e94e-4200-8217-12444de93c2e devid 1 transid 8 /dev/loop0 scanned by systemd-udevd (1044) [350.428564 ] BTRFS info (device loop0): disk space caching is enabled [350.428568 ] BTRFS info (device loop0): has skinny extents [350.429589 ] [350.429619 ] UBSAN: array-index-out-of-bounds in fs/btrfs/struct-funcs.c:161:1 [350.429636 ] index 1048096 is out of range for type 'page *[16]' [350.429650 ] CPU: 0 PID: 9 Comm: kworker/u8:1 Not tainted 5.16.0-rc4 [350.429652 ] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [350.429653 ] Workqueue: btrfs-endio-meta btrfs_work_helper [btrfs] [350.429772 ] Call Trace: [350.429774 ] [350.429776 ] dump_stack_lvl+0x47/0x5c [350.429780 ] ubsan_epilogue+0x5/0x50 [350.429786 ] __ubsan_handle_out_of_bounds+0x66/0x70 [350.429791 ] btrfs_get_16+0xfd/0x120 [btrfs] [350.429832 ] check_leaf+0x754/0x1a40 [btrfs] [350.429874 ] ? filemap_read+0x34a/0x390 [350.429878 ] ? load_balance+0x175/0xfc0 [350.429881 ] validate_extent_buffer+0x244/0x310 [btrfs] [350.429911 ] btrfs_validate_metadata_buffer+0xf8/0x100 [btrfs] [350.429935 ] end_bio_extent_readpage+0x3af/0x850 [btrfs] [350.429969 ] ? newidle_balance+0x259/0x480 [350.429972 ] end_workqueue_fn+0x29/0x40 [btrfs] [350.429995 ] btrfs_work_helper+0x71/0x330 [btrfs] [350.430030 ] ? __schedule+0x2fb/0xa40 [350.430033 ] process_one_work+0x1f6/0x400 [350.430035 ] ? process_one_work+0x400/0x400 [350.430036 ] worker_thread+0x2d/0x3d0 [350.430037 ] ? process_one_work+0x400/0x400 [350.430038 ] kthread+0x165/0x190 [350.430041 ] ? set_kthread_struct+0x40/0x40 [350.430043 ] ret_from_fork+0x1f/0x30 [350.430047 ] [350.430047 ] [350.430077 ] BTRFS warning (device loop0): bad eb member start: ptr 0xffe20f4e start 20975616 member offset 4293005178 size 2 btrfs check reports: corrupt leaf: root=3 block=20975616 physical=20975616 slot=1, unexpected item end, have 4294971193 expect 3897 The first slot item offset is 4293005033 and the size is 1966160. In check_leaf, we use btrfs_item_end() to check item boundary versus extent_buffer data size. However, return type of btrfs_item_end() is u32. (u32)(4293005033 + 1966160) == 3897, overflow happens and the result 3897 equals to leaf data size reasonably. Fix it by use u64 variable to store item data end in check_leaf() to avoid u32 overflow. This commit does solve the invalid memory access showed by the stack trace. However, its metadata profile is DUP and another copy of the leaf is fine. So the image can be mounted successfully. But when umount is called, the ASSERT btrfs_mark_buffer_dirty() will be triggered because the only node in extent tree has 0 item and invalid owner. It's solved by another commit "btrfs: check extent buffer owner against the owner rootid". Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215299 Reported-by: Wenqing Liu CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9fd145f1c4bc..aae5697dde32 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1682,6 +1682,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; + u64 item_data_end; int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1696,6 +1697,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return -EUCLEAN; } + item_data_end = (u64)btrfs_item_offset(leaf, slot) + + btrfs_item_size(leaf, slot); /* * Make sure the offset and ends are right, remember that the * item data starts at the end of the leaf and grows towards the @@ -1706,11 +1709,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) else item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { + if (unlikely(item_data_end != item_end_expected)) { generic_err(leaf, slot, - "unexpected item end, have %u expect %u", - btrfs_item_data_end(leaf, slot), - item_end_expected); + "unexpected item end, have %llu expect %u", + item_data_end, item_end_expected); return -EUCLEAN; } @@ -1719,12 +1721,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. */ - if (unlikely(btrfs_item_data_end(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info))) { + if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, - "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_data_end(leaf, slot), - BTRFS_LEAF_DATA_SIZE(fs_info)); + "slot end outside of leaf, have %llu expect range [0, %u]", + item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } -- cgit From b4be6aefa73c9a6899ef3ba9c5faaa8a66e333ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 14:56:10 -0500 Subject: btrfs: do not start relocation until in progress drops are done We hit a bug with a recovering relocation on mount for one of our file systems in production. I reproduced this locally by injecting errors into snapshot delete with balance running at the same time. This presented as an error while looking up an extent item WARNING: CPU: 5 PID: 1501 at fs/btrfs/extent-tree.c:866 lookup_inline_extent_backref+0x647/0x680 CPU: 5 PID: 1501 Comm: btrfs-balance Not tainted 5.16.0-rc8+ #8 RIP: 0010:lookup_inline_extent_backref+0x647/0x680 RSP: 0018:ffffae0a023ab960 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000000000 RBP: ffff943fd2a39b60 R08: 0000000000000000 R09: 0000000000000001 R10: 0001434088152de0 R11: 0000000000000000 R12: 0000000001d05000 R13: ffff943fd2a39b60 R14: ffff943fdb96f2a0 R15: ffff9442fc923000 FS: 0000000000000000(0000) GS:ffff944e9eb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1157b1fca8 CR3: 000000010f092000 CR4: 0000000000350ee0 Call Trace: insert_inline_extent_backref+0x46/0xd0 __btrfs_inc_extent_ref.isra.0+0x5f/0x200 ? btrfs_merge_delayed_refs+0x164/0x190 __btrfs_run_delayed_refs+0x561/0xfa0 ? btrfs_search_slot+0x7b4/0xb30 ? btrfs_update_root+0x1a9/0x2c0 btrfs_run_delayed_refs+0x73/0x1f0 ? btrfs_update_root+0x1a9/0x2c0 btrfs_commit_transaction+0x50/0xa50 ? btrfs_update_reloc_root+0x122/0x220 prepare_to_merge+0x29f/0x320 relocate_block_group+0x2b8/0x550 btrfs_relocate_block_group+0x1a6/0x350 btrfs_relocate_chunk+0x27/0xe0 btrfs_balance+0x777/0xe60 balance_kthread+0x35/0x50 ? btrfs_balance+0xe60/0xe60 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Normally snapshot deletion and relocation are excluded from running at the same time by the fs_info->cleaner_mutex. However if we had a pending balance waiting to get the ->cleaner_mutex, and a snapshot deletion was running, and then the box crashed, we would come up in a state where we have a half deleted snapshot. Again, in the normal case the snapshot deletion needs to complete before relocation can start, but in this case relocation could very well start before the snapshot deletion completes, as we simply add the root to the dead roots list and wait for the next time the cleaner runs to clean up the snapshot. Fix this by setting a bit on the fs_info if we have any DEAD_ROOT's that had a pending drop_progress key. If they do then we know we were in the middle of the drop operation and set a flag on the fs_info. Then balance can wait until this flag is cleared to start up again. If there are DEAD_ROOT's that don't have a drop_progress set then we're safe to start balance right away as we'll be properly protected by the cleaner_mutex. CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 10 ++++++++++ fs/btrfs/disk-io.c | 10 ++++++++++ fs/btrfs/extent-tree.c | 10 ++++++++++ fs/btrfs/relocation.c | 13 +++++++++++++ fs/btrfs/root-tree.c | 15 +++++++++++++++ fs/btrfs/transaction.c | 33 ++++++++++++++++++++++++++++++++- fs/btrfs/transaction.h | 1 + 7 files changed, 91 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 947f04789389..ebb2d109e8bb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -602,6 +602,9 @@ enum { /* Indicate that we want the transaction kthread to commit right now. */ BTRFS_FS_COMMIT_TRANS, + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -1106,8 +1109,15 @@ enum { BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, }; +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..48590a380762 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device set_bit(BTRFS_FS_OPEN, &fs_info->flags); + /* Kick the cleaner thread so it'll start deleting snapshots. */ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + clear_oneshot: btrfs_clear_oneshot_options(fs_info); return 0; @@ -4538,6 +4542,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + */ + btrfs_wake_unfinished_drop(fs_info); + /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d89273c4b6b8..96427b1ecac3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -5838,6 +5841,13 @@ out_free: kfree(wc); btrfs_free_path(path); out: + /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5465197996d..9d8054839782 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + /* + * This only gets set if we had a half-deleted snapshot on mount. We + * cannot allow relocation to start while we're still trying to clean up + * these pending deletions. + */ + ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); + if (ret) + return ret; + + /* We may have been woken up by close_ctree, so bail if we're closing. */ + if (btrfs_fs_closing(fs_info)) + return -EINTR; + bg = btrfs_lookup_block_group(fs_info, group_start); if (!bg) return -ENOENT; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3d68d2dcd83e..ca7426ef61c8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { + struct btrfs_key drop_key; + + btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress); + /* + * If we have a non-zero drop_progress then we know we + * made it partly through deleting this snapshot, and + * thus we need to make sure we block any balance from + * happening until this snapshot is completely dropped. + */ + if (drop_key.objectid != 0 || drop_key.type != 0 || + drop_key.offset != 0) { + set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); + set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + } + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c3cfdfd8de9b..f17bf3764ce8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1319,6 +1319,32 @@ again: return 0; } +/* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + /* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to @@ -1331,7 +1357,12 @@ void btrfs_add_dead_root(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); - list_add_tail(&root->root_list, &fs_info->dead_roots); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9402d8d94484..ba8a9826eb37 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -216,6 +216,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); -- cgit From 5fd76bf31ccfecc06e2e6b29f8c809e934085b99 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Feb 2022 15:14:43 -0800 Subject: btrfs: fix relocation crash due to premature return from btrfs_commit_transaction() We are seeing crashes similar to the following trace: [38.969182] WARNING: CPU: 20 PID: 2105 at fs/btrfs/relocation.c:4070 btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.973556] CPU: 20 PID: 2105 Comm: btrfs Not tainted 5.17.0-rc4 #54 [38.974580] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [38.976539] RIP: 0010:btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.980336] RSP: 0000:ffffb0dd42e03c20 EFLAGS: 00010206 [38.981218] RAX: ffff96cfc4ede800 RBX: ffff96cfc3ce0000 RCX: 000000000002ca14 [38.982560] RDX: 0000000000000000 RSI: 4cfd109a0bcb5d7f RDI: ffff96cfc3ce0360 [38.983619] RBP: ffff96cfc309c000 R08: 0000000000000000 R09: 0000000000000000 [38.984678] R10: ffff96cec0000001 R11: ffffe84c80000000 R12: ffff96cfc4ede800 [38.985735] R13: 0000000000000000 R14: 0000000000000000 R15: ffff96cfc3ce0360 [38.987146] FS: 00007f11c15218c0(0000) GS:ffff96d6dfb00000(0000) knlGS:0000000000000000 [38.988662] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [38.989398] CR2: 00007ffc922c8e60 CR3: 00000001147a6001 CR4: 0000000000370ee0 [38.990279] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [38.991219] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [38.992528] Call Trace: [38.992854] [38.993148] btrfs_relocate_chunk+0x27/0xe0 [btrfs] [38.993941] btrfs_balance+0x78e/0xea0 [btrfs] [38.994801] ? vsnprintf+0x33c/0x520 [38.995368] ? __kmalloc_track_caller+0x351/0x440 [38.996198] btrfs_ioctl_balance+0x2b9/0x3a0 [btrfs] [38.997084] btrfs_ioctl+0x11b0/0x2da0 [btrfs] [38.997867] ? mod_objcg_state+0xee/0x340 [38.998552] ? seq_release+0x24/0x30 [38.999184] ? proc_nr_files+0x30/0x30 [38.999654] ? call_rcu+0xc8/0x2f0 [39.000228] ? __x64_sys_ioctl+0x84/0xc0 [39.000872] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [39.001973] __x64_sys_ioctl+0x84/0xc0 [39.002566] do_syscall_64+0x3a/0x80 [39.003011] entry_SYSCALL_64_after_hwframe+0x44/0xae [39.003735] RIP: 0033:0x7f11c166959b [39.007324] RSP: 002b:00007fff2543e998 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [39.008521] RAX: ffffffffffffffda RBX: 00007f11c1521698 RCX: 00007f11c166959b [39.009833] RDX: 00007fff2543ea40 RSI: 00000000c4009420 RDI: 0000000000000003 [39.011270] RBP: 0000000000000003 R08: 0000000000000013 R09: 00007f11c16f94e0 [39.012581] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fff25440df3 [39.014046] R13: 0000000000000000 R14: 00007fff2543ea40 R15: 0000000000000001 [39.015040] [39.015418] ---[ end trace 0000000000000000 ]--- [43.131559] ------------[ cut here ]------------ [43.132234] kernel BUG at fs/btrfs/extent-tree.c:2717! [43.133031] invalid opcode: 0000 [#1] PREEMPT SMP PTI [43.133702] CPU: 1 PID: 1839 Comm: btrfs Tainted: G W 5.17.0-rc4 #54 [43.134863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [43.136426] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.139913] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.140629] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.141604] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.142645] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.143669] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.144657] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.145686] FS: 00007f7657dd68c0(0000) GS:ffff96d6df640000(0000) knlGS:0000000000000000 [43.146808] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.147584] CR2: 00007f7fe81bf5b0 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.148589] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.149581] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [43.150559] Call Trace: [43.150904] [43.151253] btrfs_finish_extent_commit+0x88/0x290 [btrfs] [43.152127] btrfs_commit_transaction+0x74f/0xaa0 [btrfs] [43.152932] ? btrfs_attach_transaction_barrier+0x1e/0x50 [btrfs] [43.153786] btrfs_ioctl+0x1edc/0x2da0 [btrfs] [43.154475] ? __check_object_size+0x150/0x170 [43.155170] ? preempt_count_add+0x49/0xa0 [43.155753] ? __x64_sys_ioctl+0x84/0xc0 [43.156437] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [43.157456] __x64_sys_ioctl+0x84/0xc0 [43.157980] do_syscall_64+0x3a/0x80 [43.158543] entry_SYSCALL_64_after_hwframe+0x44/0xae [43.159231] RIP: 0033:0x7f7657f1e59b [43.161819] RSP: 002b:00007ffda5cd1658 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [43.162702] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f7657f1e59b [43.163526] RDX: 0000000000000000 RSI: 0000000000009408 RDI: 0000000000000003 [43.164358] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 [43.165208] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [43.166029] R13: 00005621b91c3232 R14: 00005621b91ba580 R15: 00007ffda5cd1800 [43.166863] [43.167125] Modules linked in: btrfs blake2b_generic xor pata_acpi ata_piix libata raid6_pq scsi_mod libcrc32c virtio_net virtio_rng net_failover rng_core failover scsi_common [43.169552] ---[ end trace 0000000000000000 ]--- [43.171226] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.174767] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.175600] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.176468] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.177357] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.178271] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.179178] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.180071] FS: 00007f7657dd68c0(0000) GS:ffff96d6df800000(0000) knlGS:0000000000000000 [43.181073] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.181808] CR2: 00007fe09905f010 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.182706] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.183591] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 We first hit the WARN_ON(rc->block_group->pinned > 0) in btrfs_relocate_block_group() and then the BUG_ON(!cache) in unpin_extent_range(). This tells us that we are exiting relocation and removing the block group with bytes still pinned for that block group. This is supposed to be impossible: the last thing relocate_block_group() does is commit the transaction to get rid of pinned extents. Commit d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") introduced an optimization so that commits from fsync don't have to wait for the previous commit to unpin extents. This was only intended to affect fsync, but it inadvertently made it possible for any commit to skip waiting for the previous commit to unpin. This is because if a call to btrfs_commit_transaction() finds that another thread is already committing the transaction, it waits for the other thread to complete the commit and then returns. If that other thread was in fsync, then it completes the commit without completing the previous commit. This makes the following sequence of events possible: Thread 1____________________|Thread 2 (fsync)_____________________|Thread 3 (balance)___________________ btrfs_commit_transaction(N) | | btrfs_run_delayed_refs | | pin extents | | ... | | state = UNBLOCKED |btrfs_sync_file | | btrfs_start_transaction(N + 1) |relocate_block_group | | btrfs_join_transaction(N + 1) | btrfs_commit_transaction(N + 1) | ... | trans->state = COMMIT_START | | | btrfs_commit_transaction(N + 1) | | wait_for_commit(N + 1, COMPLETED) | wait_for_commit(N, SUPER_COMMITTED)| state = SUPER_COMMITTED | ... | btrfs_finish_extent_commit| | unpin_extent_range() | trans->state = COMPLETED | | | return | | ... | |Thread 1 isn't done, so pinned > 0 | |and we WARN | | | |btrfs_remove_block_group unpin_extent_range() | | Thread 3 removed the | | block group, so we BUG| | There are other sequences involving SUPER_COMMITTED transactions that can cause a similar outcome. We could fix this by making relocation explicitly wait for unpinning, but there may be other cases that need it. Josef mentioned ENOSPC flushing and the free space cache inode as other potential victims. Rather than playing whack-a-mole, this fix is conservative and makes all commits not in fsync wait for all previous transactions, which is what the optimization intended. Fixes: d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f17bf3764ce8..1f1c25db6f6b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state >= min_state); + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) -- cgit From d4aef1e122d8bbdc15ce3bd0bc813d6b44a7d63a Mon Sep 17 00:00:00 2001 From: Sidong Yang Date: Mon, 28 Feb 2022 01:43:40 +0000 Subject: btrfs: qgroup: fix deadlock between rescan worker and remove qgroup The commit e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") by Kawasaki resolves deadlock between quota disable and qgroup rescan worker. But also there is a deadlock case like it. It's about enabling or disabling quota and creating or removing qgroup. It can be reproduced in simple script below. for i in {1..100} do btrfs quota enable /mnt & btrfs qgroup create 1/0 /mnt & btrfs qgroup destroy 1/0 /mnt & btrfs quota disable /mnt & done Here's why the deadlock happens: 1) The quota rescan task is running. 2) Task A calls btrfs_quota_disable(), locks the qgroup_ioctl_lock mutex, and then calls btrfs_qgroup_wait_for_completion(), to wait for the quota rescan task to complete. 3) Task B calls btrfs_remove_qgroup() and it blocks when trying to lock the qgroup_ioctl_lock mutex, because it's being held by task A. At that point task B is holding a transaction handle for the current transaction. 4) The quota rescan task calls btrfs_commit_transaction(). This results in it waiting for all other tasks to release their handles on the transaction, but task B is blocked on the qgroup_ioctl_lock mutex while holding a handle on the transaction, and that mutex is being held by task A, which is waiting for the quota rescan task to complete, resulting in a deadlock between these 3 tasks. To resolve this issue, the thread disabling quota should unlock qgroup_ioctl_lock before waiting rescan completion. Move btrfs_qgroup_wait_for_completion() after unlock of qgroup_ioctl_lock. Fixes: e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Reviewed-by: Shin'ichiro Kawasaki Signed-off-by: Sidong Yang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f12dc687350c..30d42ea655ce 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1196,6 +1196,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (!fs_info->quota_root) goto out; + /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may @@ -1203,7 +1211,6 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) */ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); - mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * 1 For the root item -- cgit From 4751dc99627e4d1465c5bfa8cb7ab31ed418eff5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Feb 2022 16:29:28 +0000 Subject: btrfs: add missing run of delayed items after unlink during log replay During log replay, whenever we need to check if a name (dentry) exists in a directory we do searches on the subvolume tree for inode references or or directory entries (BTRFS_DIR_INDEX_KEY keys, and BTRFS_DIR_ITEM_KEY keys as well, before kernel 5.17). However when during log replay we unlink a name, through btrfs_unlink_inode(), we may not delete inode references and dir index keys from a subvolume tree and instead just add the deletions to the delayed inode's delayed items, which will only be run when we commit the transaction used for log replay. This means that after an unlink operation during log replay, if we attempt to search for the same name during log replay, we will not see that the name was already deleted, since the deletion is recorded only on the delayed items. We run delayed items after every unlink operation during log replay, except at unlink_old_inode_refs() and at add_inode_ref(). This was due to an overlook, as delayed items should be run after evert unlink, for the reasons stated above. So fix those two cases. Fixes: 0d836392cadd5 ("Btrfs: fix mount failure after fsync due to hard link recreation") Fixes: 1f250e929a9c9 ("Btrfs: fix log replay failure after unlink and link combination") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 42caf595c936..6bc8834ac8f7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1362,6 +1362,15 @@ again: inode, name, namelen); kfree(name); iput(dir); + /* + * Whenever we need to check if a name exists or not, we + * check the subvolume tree. So after an unlink we must + * run delayed items, so that future checks for a name + * during log replay see that the name does not exists + * anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); if (ret) goto out; goto again; @@ -1614,6 +1623,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ if (!ret && inode->i_nlink == 0) inc_nlink(inode); + /* + * Whenever we need to check if a name exists or + * not, we check the subvolume tree. So after an + * unlink we must run delayed items, so that future + * checks for a name during log replay see that the + * name does not exists anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); } if (ret < 0) goto out; -- cgit From c6c937d673aaa1d603f62f134e1ca9c173eeeed3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 1 Mar 2022 20:49:41 +0800 Subject: KVM: x86/mmu: Passing up the error state of mmu_alloc_shadow_roots() Just like on the optional mmu_alloc_direct_roots() path, once shadow path reaches "r = -EIO" somewhere, the caller needs to know the actual state in order to enter error handling and avoid something worse. Fixes: 4a38162ee9f1 ("KVM: MMU: load PDPTRs outside mmu_lock") Signed-off-by: Like Xu Reviewed-by: Sean Christopherson Message-Id: <20220301124941.48412-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8e24f73bf60b..5628d0ba637e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3565,7 +3565,7 @@ set_root_pgd: out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - return 0; + return r; } static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) -- cgit From 8d25b7beca7ed6ca34f53f0f8abd009e2be15d94 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 19 Feb 2022 04:28:20 -0500 Subject: KVM: x86: pull kvm->srcu read-side to kvm_arch_vcpu_ioctl_run kvm_arch_vcpu_ioctl_run is already doing srcu_read_lock/unlock in two places, namely vcpu_run and post_kvm_run_save, and a third is actually needed around the call to vcpu->arch.complete_userspace_io to avoid the following splat: WARNING: suspicious RCU usage arch/x86/kvm/pmu.c:190 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by CPU 28/KVM/370841: #0: ff11004089f280b8 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x87/0x730 [kvm] Call Trace: dump_stack_lvl+0x59/0x73 reprogram_fixed_counter+0x15d/0x1a0 [kvm] kvm_pmu_trigger_event+0x1a3/0x260 [kvm] ? free_moved_vector+0x1b4/0x1e0 complete_fast_pio_in+0x8a/0xd0 [kvm] This splat is not at all unexpected, since complete_userspace_io callbacks can execute similar code to vmexits. For example, SVM with nrips=false will call into the emulator from svm_skip_emulated_instruction(). While it's tempting to never acquire kvm->srcu for an uninitialized vCPU, practically speaking there's no penalty to acquiring kvm->srcu "early" as the KVM_MP_STATE_UNINITIALIZED path is a one-time thing per vCPU. On the other hand, seemingly innocuous helpers like kvm_apic_accept_events() and sync_regs() can theoretically reach code that might access SRCU-protected data structures, e.g. sync_regs() can trigger forced existing of nested mode via kvm_vcpu_ioctl_x86_set_vcpu_events(). Reported-by: Like Xu Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82a9dcd8c67f..eb4029660bd9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9180,6 +9180,7 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) likely(!pic_in_kernel(vcpu->kvm)); } +/* Called within kvm->srcu read side. */ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; @@ -9188,16 +9189,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - /* - * The call to kvm_ready_for_interrupt_injection() may end up in - * kvm_xen_has_interrupt() which may require the srcu lock to be - * held, to protect against changes in the vcpu_info address. - */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; @@ -9815,6 +9809,7 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); /* + * Called within kvm->srcu read side. * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the * userspace. @@ -10193,6 +10188,7 @@ out: return r; } +/* Called within kvm->srcu read side. */ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { bool hv_timer; @@ -10252,12 +10248,12 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } +/* Called within kvm->srcu read side. */ static int vcpu_run(struct kvm_vcpu *vcpu) { int r; struct kvm *kvm = vcpu->kvm; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vcpu->arch.l1tf_flush_l1d = true; for (;;) { @@ -10285,14 +10281,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (__xfer_to_guest_mode_work_pending()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); r = xfer_to_guest_mode_handle_work(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (r) return r; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - return r; } @@ -10398,6 +10392,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; + struct kvm *kvm = vcpu->kvm; int r; vcpu_load(vcpu); @@ -10405,6 +10400,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { r = -EINTR; @@ -10415,7 +10411,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * use before KVM has ever run the vCPU. */ WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); + + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (kvm_apic_accept_events(vcpu) < 0) { r = 0; goto out; @@ -10475,8 +10475,9 @@ out: if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); - kvm_sigset_deactivate(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_sigset_deactivate(vcpu); vcpu_put(vcpu); return r; } -- cgit From 90f8f4c0e3cebd541deaa45cf0e470bb9810dd4f Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Mon, 28 Feb 2022 12:39:57 -0800 Subject: ptp: ocp: Add ptp_ocp_adjtime_coarse for large adjustments In ("ptp: ocp: Have FPGA fold in ns adjustment for adjtime."), the ns adjustment was written to the FPGA register, so the clock could accurately perform adjustments. However, the adjtime() call passes in a s64, while the clock adjustment registers use a s32. When trying to perform adjustments with a large value (37 sec), things fail. Examine the incoming delta, and if larger than 1 sec, use the original (coarse) adjustment method. If smaller than 1 sec, then allow the FPGA to fold in the changes over a 1 second window. Fixes: 6d59d4fa1789 ("ptp: ocp: Have FPGA fold in ns adjustment for adjtime.") Signed-off-by: Jonathan Lemon Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20220228203957.367371-1-jonathan.lemon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 0f1b5a7d2a89..17ad5f0d13b2 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -607,7 +607,7 @@ ptp_ocp_settime(struct ptp_clock_info *ptp_info, const struct timespec64 *ts) } static void -__ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u64 adj_val) +__ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u32 adj_val) { u32 select, ctrl; @@ -615,7 +615,7 @@ __ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u64 adj_val) iowrite32(OCP_SELECT_CLK_REG, &bp->reg->select); iowrite32(adj_val, &bp->reg->offset_ns); - iowrite32(adj_val & 0x7f, &bp->reg->offset_window_ns); + iowrite32(NSEC_PER_SEC, &bp->reg->offset_window_ns); ctrl = OCP_CTRL_ADJUST_OFFSET | OCP_CTRL_ENABLE; iowrite32(ctrl, &bp->reg->ctrl); @@ -624,6 +624,22 @@ __ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u64 adj_val) iowrite32(select >> 16, &bp->reg->select); } +static void +ptp_ocp_adjtime_coarse(struct ptp_ocp *bp, u64 delta_ns) +{ + struct timespec64 ts; + unsigned long flags; + int err; + + spin_lock_irqsave(&bp->lock, flags); + err = __ptp_ocp_gettime_locked(bp, &ts, NULL); + if (likely(!err)) { + timespec64_add_ns(&ts, delta_ns); + __ptp_ocp_settime_locked(bp, &ts); + } + spin_unlock_irqrestore(&bp->lock, flags); +} + static int ptp_ocp_adjtime(struct ptp_clock_info *ptp_info, s64 delta_ns) { @@ -631,6 +647,11 @@ ptp_ocp_adjtime(struct ptp_clock_info *ptp_info, s64 delta_ns) unsigned long flags; u32 adj_ns, sign; + if (delta_ns > NSEC_PER_SEC || -delta_ns > NSEC_PER_SEC) { + ptp_ocp_adjtime_coarse(bp, delta_ns); + return 0; + } + sign = delta_ns < 0 ? BIT(31) : 0; adj_ns = sign ? -delta_ns : delta_ns; -- cgit From 875ad06015329314c594d3302ac2bbea37774543 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 27 Feb 2022 12:00:51 -0800 Subject: iwlwifi: fix build error for IWLMEI When CONFIG_IWLWIFI=m and CONFIG_IWLMEI=y, the kernel build system must be told to build the iwlwifi/ subdirectory for both IWLWIFI and IWLMEI so that builds for both =y and =m are done. This resolves an undefined reference build error: ERROR: modpost: "iwl_mei_is_connected" [drivers/net/wireless/intel/iwlwifi/iwlwifi.ko] undefined! Fixes: 977df8bd5844 ("wlwifi: work around reverse dependency on MEI") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Arnd Bergmann Cc: Luca Coelho Cc: linux-wireless@vger.kernel.org Link: https://lore.kernel.org/r/20220227200051.7176-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/Makefile b/drivers/net/wireless/intel/Makefile index 1364b0014488..208e73a16051 100644 --- a/drivers/net/wireless/intel/Makefile +++ b/drivers/net/wireless/intel/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_IPW2200) += ipw2x00/ obj-$(CONFIG_IWLEGACY) += iwlegacy/ obj-$(CONFIG_IWLWIFI) += iwlwifi/ +obj-$(CONFIG_IWLMEI) += iwlwifi/ -- cgit From e50b88c4f076242358b66ddb67482b96947438f2 Mon Sep 17 00:00:00 2001 From: Sreeramya Soratkal Date: Tue, 1 Mar 2022 11:33:20 +0530 Subject: nl80211: Update bss channel on channel switch for P2P_CLIENT The wdev channel information is updated post channel switch only for the station mode and not for the other modes. Due to this, the P2P client still points to the old value though it moved to the new channel when the channel change is induced from the P2P GO. Update the bss channel after CSA channel switch completion for P2P client interface as well. Signed-off-by: Sreeramya Soratkal Link: https://lore.kernel.org/r/1646114600-31479-1-git-send-email-quic_ssramya@quicinc.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b1909ce2b739..c01fbcc848e8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17828,7 +17828,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->chandef = *chandef; wdev->preset_chandef = *chandef; - if (wdev->iftype == NL80211_IFTYPE_STATION && + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && !WARN_ON(!wdev->current_bss)) cfg80211_update_assoc_bss_entry(wdev, chandef->chan); -- cgit From e6e91ec966db5af4f059cfbac1af06560404b317 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 2 Mar 2022 09:27:15 +0200 Subject: iwlwifi: mvm: return value for request_ownership Propagate the value to the user space so it can understand if the operation failed or not. Fixes: bfcfdb59b669 ("iwlwifi: mvm: add vendor commands needed for iwlmei") Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20220302072715.4885-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c b/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c index 78450366312b..080a1587caa5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c @@ -71,12 +71,13 @@ static int iwl_mvm_vendor_host_get_ownership(struct wiphy *wiphy, { struct ieee80211_hw *hw = wiphy_to_ieee80211_hw(wiphy); struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); + int ret; mutex_lock(&mvm->mutex); - iwl_mvm_mei_get_ownership(mvm); + ret = iwl_mvm_mei_get_ownership(mvm); mutex_unlock(&mvm->mutex); - return 0; + return ret; } static const struct wiphy_vendor_command iwl_mvm_vendor_commands[] = { -- cgit From 4424c35ead667ba2e8de7ab8206da66453e6f728 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Feb 2022 17:47:16 +0200 Subject: auxdisplay: lcd2s: Fix lcd2s_redefine_char() feature It seems that the lcd2s_redefine_char() has never been properly tested. The buffer is filled by DEF_CUSTOM_CHAR command followed by the character number (from 0 to 7), but immediately after that these bytes are rewritten by the decoded hex stream. Fix the index to fill the buffer after the command and number. Fixes: 8c9108d014c5 ("auxdisplay: add a driver for lcd2s character display") Cc: Lars Poeschel Signed-off-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven [fixed typo in commit message] Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/lcd2s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index 38ba08628ccb..ea9d75ad4f16 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -238,7 +238,7 @@ static int lcd2s_redefine_char(struct charlcd *lcd, char *esc) if (buf[1] > 7) return 1; - i = 0; + i = 2; shift = 0; value = 0; while (*esc && i < LCD2S_CHARACTER_SIZE + 2) { -- cgit From 898c0a15425a5bcaa8d44bd436eae5afd2483796 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Feb 2022 17:47:17 +0200 Subject: auxdisplay: lcd2s: Fix memory leak in ->remove() Once allocated the struct lcd2s_data is never freed. Fix the memory leak by switching to devm_kzalloc(). Fixes: 8c9108d014c5 ("auxdisplay: add a driver for lcd2s character display") Cc: Lars Poeschel Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/lcd2s.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index ea9d75ad4f16..7b9cfbbaf007 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -298,6 +298,10 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, I2C_FUNC_SMBUS_WRITE_BLOCK_DATA)) return -EIO; + lcd2s = devm_kzalloc(&i2c->dev, sizeof(*lcd2s), GFP_KERNEL); + if (!lcd2s) + return -ENOMEM; + /* Test, if the display is responding */ err = lcd2s_i2c_smbus_write_byte(i2c, LCD2S_CMD_DISPLAY_OFF); if (err < 0) @@ -307,12 +311,6 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, if (!lcd) return -ENOMEM; - lcd2s = kzalloc(sizeof(struct lcd2s_data), GFP_KERNEL); - if (!lcd2s) { - err = -ENOMEM; - goto fail1; - } - lcd->drvdata = lcd2s; lcd2s->i2c = i2c; lcd2s->charlcd = lcd; @@ -321,24 +319,22 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, err = device_property_read_u32(&i2c->dev, "display-height-chars", &lcd->height); if (err) - goto fail2; + goto fail1; err = device_property_read_u32(&i2c->dev, "display-width-chars", &lcd->width); if (err) - goto fail2; + goto fail1; lcd->ops = &lcd2s_ops; err = charlcd_register(lcd2s->charlcd); if (err) - goto fail2; + goto fail1; i2c_set_clientdata(i2c, lcd2s); return 0; -fail2: - kfree(lcd2s); fail1: kfree(lcd); return err; -- cgit From 9ed331f8a0fb674f4f06edf05a1687bf755af27b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Feb 2022 17:47:18 +0200 Subject: auxdisplay: lcd2s: Use proper API to free the instance of charlcd object While it might work, the current approach is fragile in a few ways: - whenever members in the structure are shuffled, the pointer will be wrong - the resource freeing may include more than covered by kfree() Fix this by using charlcd_free() call instead of kfree(). Fixes: 8c9108d014c5 ("auxdisplay: add a driver for lcd2s character display") Cc: Lars Poeschel Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/lcd2s.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index 7b9cfbbaf007..2578b2d45439 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -336,7 +336,7 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, return 0; fail1: - kfree(lcd); + charlcd_free(lcd2s->charlcd); return err; } @@ -345,7 +345,7 @@ static int lcd2s_i2c_remove(struct i2c_client *i2c) struct lcd2s_data *lcd2s = i2c_get_clientdata(i2c); charlcd_unregister(lcd2s->charlcd); - kfree(lcd2s->charlcd); + charlcd_free(lcd2s->charlcd); return 0; } -- cgit From f1ef17011c765495c876fa75435e59eecfdc1ee4 Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Tue, 1 Mar 2022 14:11:59 +0800 Subject: drm/amdgpu: fix suspend/resume hang regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regression has been reported that suspend/resume may hang with the previous vm ready check commit. So bring back the evicted list check as a temp fix. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1922 Fixes: c1a66c3bc425 ("drm/amdgpu: check vm ready by amdgpu_vm->evicting flag") Reviewed-by: Christian König Signed-off-by: Qiang Yu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index d62190b3dd9b..418341a67517 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -777,7 +777,8 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) amdgpu_vm_eviction_lock(vm); ret = !vm->evicting; amdgpu_vm_eviction_unlock(vm); - return ret; + + return ret && list_empty(&vm->evicted); } /** -- cgit From 224102de2ff105a2c05695e66a08f4b5b6b2d19c Mon Sep 17 00:00:00 2001 From: lena wang Date: Tue, 1 Mar 2022 19:17:09 +0800 Subject: net: fix up skbs delta_truesize in UDP GRO frag_list The truesize for a UDP GRO packet is added by main skb and skbs in main skb's frag_list: skb_gro_receive_list p->truesize += skb->truesize; The commit 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist") introduced a truesize increase for frag_list skbs. When uncloning skb, it will call pskb_expand_head and trusesize for frag_list skbs may increase. This can occur when allocators uses __netdev_alloc_skb and not jump into __alloc_skb. This flow does not use ksize(len) to calculate truesize while pskb_expand_head uses. skb_segment_list err = skb_unclone(nskb, GFP_ATOMIC); pskb_expand_head if (!skb->sk || skb->destructor == sock_edemux) skb->truesize += size - osize; If we uses increased truesize adding as delta_truesize, it will be larger than before and even larger than previous total truesize value if skbs in frag_list are abundant. The main skb truesize will become smaller and even a minus value or a huge value for an unsigned int parameter. Then the following memory check will drop this abnormal skb. To avoid this error we should use the original truesize to segment the main skb. Fixes: 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist") Signed-off-by: lena wang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/1646133431-8948-1-git-send-email-lena.wang@mediatek.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8138c372535..ea51e23e9247 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3876,6 +3876,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, list_skb = list_skb->next; err = 0; + delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { @@ -3900,7 +3901,6 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, tail = nskb; delta_len += nskb->len; - delta_truesize += nskb->truesize; skb_push(nskb, -skb_network_offset(nskb) + offset); -- cgit From 1dba41c9d2e2dc94b543394974f63d55aa195bfe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 1 Mar 2022 05:34:40 -0600 Subject: net: ipa: add an interconnect dependency In order to function, the IPA driver very clearly requires the interconnect framework to be enabled in the kernel configuration. State that dependency in the Kconfig file. This became a problem when CONFIG_COMPILE_TEST support was added. Non-Qualcomm platforms won't necessarily enable CONFIG_INTERCONNECT. Reported-by: kernel test robot Fixes: 38a4066f593c5 ("net: ipa: support COMPILE_TEST") Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20220301113440.257916-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/Kconfig b/drivers/net/ipa/Kconfig index e0164a55c1e6..6782c2cbf542 100644 --- a/drivers/net/ipa/Kconfig +++ b/drivers/net/ipa/Kconfig @@ -2,6 +2,7 @@ config QCOM_IPA tristate "Qualcomm IPA support" depends on NET && QCOM_SMEM depends on ARCH_QCOM || COMPILE_TEST + depends on INTERCONNECT depends on QCOM_RPROC_COMMON || (QCOM_RPROC_COMMON=n && COMPILE_TEST) depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select QCOM_MDT_LOADER if ARCH_QCOM -- cgit From 60ce37b03917e593d8e5d8bcc7ec820773daf81d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2022 08:17:22 -0800 Subject: bpf, sockmap: Do not ignore orig_len parameter Currently, sk_psock_verdict_recv() returns skb->len This is problematic because tcp_read_sock() might have passed orig_len < skb->len, due to the presence of TCP urgent data. This causes an infinite loop from tcp_read_sock() Followup patch will make tcp_read_sock() more robust vs bad actors. Fixes: ef5659280eb1 ("bpf, sockmap: Allow skipping sk_skb parser program") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: John Fastabend Acked-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220302161723.3910001-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8eb671c827f9..929a2b096b04 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1153,7 +1153,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; - int len = skb->len; + int len = orig_len; /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ skb = skb_clone(skb, GFP_ATOMIC); -- cgit From e3d5ea2c011ecb16fb94c56a659364e6b30fac94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2022 08:17:23 -0800 Subject: tcp: make tcp_read_sock() more robust If recv_actor() returns an incorrect value, tcp_read_sock() might loop forever. Instead, issue a one time warning and make sure to make progress. Signed-off-by: Eric Dumazet Acked-by: John Fastabend Acked-by: Jakub Sitnicki Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220302161723.3910001-2-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 02cb275e5487..28ff2a820f7c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1684,11 +1684,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it -- cgit From bd6f1fd5d33dfe5d1b4f2502d3694a7cc13f166d Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Wed, 2 Mar 2022 20:24:23 +0800 Subject: net: arcnet: com20020: Fix null-ptr-deref in com20020pci_probe() During driver initialization, the pointer of card info, i.e. the variable 'ci' is required. However, the definition of 'com20020pci_id_table' reveals that this field is empty for some devices, which will cause null pointer dereference when initializing these devices. The following log reveals it: [ 3.973806] KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] [ 3.973819] RIP: 0010:com20020pci_probe+0x18d/0x13e0 [com20020_pci] [ 3.975181] Call Trace: [ 3.976208] local_pci_probe+0x13f/0x210 [ 3.977248] pci_device_probe+0x34c/0x6d0 [ 3.977255] ? pci_uevent+0x470/0x470 [ 3.978265] really_probe+0x24c/0x8d0 [ 3.978273] __driver_probe_device+0x1b3/0x280 [ 3.979288] driver_probe_device+0x50/0x370 Fix this by checking whether the 'ci' is a null pointer first. Fixes: 8c14f9c70327 ("ARCNET: add com20020 PCI IDs with metadata") Signed-off-by: Zheyu Ma Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 6382e1937cca..c580acb8b1d3 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -138,6 +138,9 @@ static int com20020pci_probe(struct pci_dev *pdev, return -ENOMEM; ci = (struct com20020_pci_card_info *)id->driver_data; + if (!ci) + return -EINVAL; + priv->ci = ci; mm = &ci->misc_map; -- cgit From 0537f0a2151375dcf90c1bbfda6a0aaf57164e89 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 2 Mar 2022 21:25:11 +0800 Subject: net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error generated by client The main reason for this unexpected SMC_CLC_DECL_ERR_REGRMB in client dues to following execution sequence: Server Conn A: Server Conn B: Client Conn B: smc_lgr_unregister_conn smc_lgr_register_conn smc_clc_send_accept -> smc_rtoken_add smcr_buf_unuse -> Client Conn A: smc_rtoken_delete smc_lgr_unregister_conn() makes current link available to assigned to new incoming connection, while smcr_buf_unuse() has not executed yet, which means that smc_rtoken_add may fail because of insufficient rtoken_entry, reversing their execution order will avoid this problem. Fixes: 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers") Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 29525d03b253..f8c9675f4af3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1161,8 +1161,8 @@ void smc_conn_free(struct smc_connection *conn) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { - smc_lgr_unregister_conn(conn); smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) -- cgit From 4940a1fdf31c39f0806ac831cde333134862030b Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 2 Mar 2022 21:25:12 +0800 Subject: net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error cause by server The problem of SMC_CLC_DECL_ERR_REGRMB on the server is very clear. Based on the fact that whether a new SMC connection can be accepted or not depends on not only the limit of conn nums, but also the available entries of rtoken. Since the rtoken release is trigger by peer, while the conn nums is decrease by local, tons of thing can happen in this time difference. This only thing that needs to be mentioned is that now all connection creations are completely protected by smc_server_lgr_pending lock, it's enough to check only the available entries in rtokens_used_mask. Fixes: cd6851f30386 ("smc: remote memory buffers (RMBs)") Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f8c9675f4af3..be7d704976ff 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1864,7 +1864,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; -- cgit From 815d5121927093017947fd76e627da03f0f70be7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 2 Mar 2022 14:44:39 +0100 Subject: Bluetooth: hci_core: Fix unbalanced unlock in set_device_flags() There is only one "goto done;" in set_device_flags() and this happens *before* hci_dev_lock() is called, move the done label to after the hci_dev_unlock() to fix the following unlock balance: [ 31.493567] ===================================== [ 31.493571] WARNING: bad unlock balance detected! [ 31.493576] 5.17.0-rc2+ #13 Tainted: G C E [ 31.493581] ------------------------------------- [ 31.493584] bluetoothd/685 is trying to release lock (&hdev->lock) at: [ 31.493594] [] set_device_flags+0x65/0x1f0 [bluetooth] [ 31.493684] but there are no more locks to release! Note this bug has been around for a couple of years, but before commit fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags") supported_flags was hardcoded to "((1U << HCI_CONN_FLAG_MAX) - 1)" so the check for unsupported flags which does the "goto done;" never triggered. Fixes: fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags") Cc: Luiz Augusto von Dentz Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 533cf60673a3..230a7a8196c0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4541,9 +4541,9 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, } } -done: hci_dev_unlock(hdev); +done: if (status == MGMT_STATUS_SUCCESS) device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type, supported_flags, current_flags); -- cgit From 008ee9eb8a11bcabf12c91771dd4f470b082bd44 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 2 Mar 2022 13:02:45 -0800 Subject: Bluetooth: hci_sync: Fix not processing all entries on cmd_sync_work hci_cmd_sync_queue can be called multiple times, each adding a hci_cmd_sync_work_entry, before hci_cmd_sync_work is run so this makes sure they are all dequeued properly otherwise it creates a backlog of entries that are never run. Link: https://lore.kernel.org/all/CAJCQCtSeUtHCgsHXLGrSTWKmyjaQDbDNpP4rb0i+RE+L2FTXSA@mail.gmail.com/T/ Fixes: 6a98e3836fa20 ("Bluetooth: Add helper for serialized HCI command execution") Tested-by: Chris Clayton Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 49 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 9ba2a1a7d481..ab9aa700b6b3 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -276,40 +276,37 @@ EXPORT_SYMBOL(__hci_cmd_sync_status); static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); - struct hci_cmd_sync_work_entry *entry; - hci_cmd_sync_work_func_t func; - hci_cmd_sync_work_destroy_t destroy; - void *data; bt_dev_dbg(hdev, ""); - mutex_lock(&hdev->cmd_sync_work_lock); - entry = list_first_entry(&hdev->cmd_sync_work_list, - struct hci_cmd_sync_work_entry, list); - if (entry) { - list_del(&entry->list); - func = entry->func; - data = entry->data; - destroy = entry->destroy; - kfree(entry); - } else { - func = NULL; - data = NULL; - destroy = NULL; - } - mutex_unlock(&hdev->cmd_sync_work_lock); + /* Dequeue all entries and run them */ + while (1) { + struct hci_cmd_sync_work_entry *entry; - if (func) { - int err; + mutex_lock(&hdev->cmd_sync_work_lock); + entry = list_first_entry_or_null(&hdev->cmd_sync_work_list, + struct hci_cmd_sync_work_entry, + list); + if (entry) + list_del(&entry->list); + mutex_unlock(&hdev->cmd_sync_work_lock); + + if (!entry) + break; - hci_req_sync_lock(hdev); + bt_dev_dbg(hdev, "entry %p", entry); - err = func(hdev, data); + if (entry->func) { + int err; - if (destroy) - destroy(hdev, data, err); + hci_req_sync_lock(hdev); + err = entry->func(hdev, entry->data); + if (entry->destroy) + entry->destroy(hdev, entry->data, err); + hci_req_sync_unlock(hdev); + } - hci_req_sync_unlock(hdev); + kfree(entry); } } -- cgit From f1fb205efb0ccca55626fd4ef38570dd16b44719 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Tue, 1 Mar 2022 23:28:22 +0100 Subject: sfc: extend the locking on mcdi->seqno seqno could be read as a stale value outside of the lock. The lock is already acquired to protect the modification of seqno against a possible race condition. Place the reading of this value also inside this locking to protect it against a possible race condition. Signed-off-by: Niels Dossche Acked-by: Martin Habets Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/mcdi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c index be6bfd6b7ec7..50baf62b2cbc 100644 --- a/drivers/net/ethernet/sfc/mcdi.c +++ b/drivers/net/ethernet/sfc/mcdi.c @@ -163,9 +163,9 @@ static void efx_mcdi_send_request(struct efx_nic *efx, unsigned cmd, /* Serialise with efx_mcdi_ev_cpl() and efx_mcdi_ev_death() */ spin_lock_bh(&mcdi->iface_lock); ++mcdi->seqno; + seqno = mcdi->seqno & SEQ_MASK; spin_unlock_bh(&mcdi->iface_lock); - seqno = mcdi->seqno & SEQ_MASK; xflags = 0; if (mcdi->mode == MCDI_MODE_EVENTS) xflags |= MCDI_HEADER_XFLAGS_EVREQ; -- cgit From 8ccffe9ac3239e549beaa0a9d5e1a1eac94e866c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 2 Mar 2022 21:21:15 +0100 Subject: bnx2: Fix an error message Fix an error message and report the correct failing function. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index e20aafeb4ca9..b97ed9b5f685 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -8216,7 +8216,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) rc = dma_set_coherent_mask(&pdev->dev, persist_dma_mask); if (rc) { dev_err(&pdev->dev, - "pci_set_consistent_dma_mask failed, aborting\n"); + "dma_set_coherent_mask failed, aborting\n"); goto err_out_unmap; } } else if ((rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) != 0) { -- cgit From 10b6bb62ae1a49ee818fc479cf57b8900176773e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Mar 2022 21:39:39 +0200 Subject: net: dcb: disable softirqs in dcbnl_flush_dev() Ido Schimmel points out that since commit 52cff74eef5d ("dcbnl : Disable software interrupts before taking dcb_lock"), the DCB API can be called by drivers from softirq context. One such in-tree example is the chelsio cxgb4 driver: dcb_rpl -> cxgb4_dcb_handle_fw_update -> dcb_ieee_setapp If the firmware for this driver happened to send an event which resulted in a call to dcb_ieee_setapp() at the exact same time as another DCB-enabled interface was unregistering on the same CPU, the softirq would deadlock, because the interrupted process was already holding the dcb_lock in dcbnl_flush_dev(). Fix this unlikely event by using spin_lock_bh() in dcbnl_flush_dev() as in the rest of the dcbnl code. Fixes: 91b0383fef06 ("net: dcb: flush lingering app table entries for unregistered devices") Reported-by: Ido Schimmel Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220302193939.1368823-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dcb/dcbnl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 36c91273daac..dc4fb699b56c 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2077,7 +2077,7 @@ static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { @@ -2086,7 +2086,7 @@ static void dcbnl_flush_dev(struct net_device *dev) } } - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, -- cgit From dc9752075341e7beb653e37c6f4a3723074dc8bc Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 2 Mar 2022 18:14:46 +0200 Subject: selftests: mlxsw: tc_police_scale: Make test more robust The test adds tc filters and checks how many of them were offloaded by grepping for 'in_hw'. iproute2 commit f4cd4f127047 ("tc: add skip_hw and skip_sw to control action offload") added offload indication to tc actions, producing the following output: $ tc filter show dev swp2 ingress ... filter protocol ipv6 pref 1000 flower chain 0 handle 0x7c0 eth_type ipv6 dst_ip 2001:db8:1::7bf skip_sw in_hw in_hw_count 1 action order 1: police 0x7c0 rate 10Mbit burst 100Kb mtu 2Kb action drop overhead 0b ref 1 bind 1 not_in_hw used_hw_stats immediate The current grep expression matches on both 'in_hw' and 'not_in_hw', resulting in incorrect results. Fix that by using JSON output instead. Fixes: 5061e773264b ("selftests: mlxsw: Add scale test for tc-police") Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh index 3e3e06ea5703..86e787895f78 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh @@ -60,7 +60,8 @@ __tc_police_test() tc_police_rules_create $count $should_fail - offload_count=$(tc filter show dev $swp1 ingress | grep in_hw | wc -l) + offload_count=$(tc -j filter show dev $swp1 ingress | + jq "[.[] | select(.options.in_hw == true)] | length") ((offload_count == count)) check_err_fail $should_fail $? "tc police offload count" } -- cgit From 196f9bc050cbc5085b4cbb61cce2efe380bc66d0 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 2 Mar 2022 18:14:47 +0200 Subject: selftests: mlxsw: resource_scale: Fix return value The test runs several test cases and is supposed to return an error in case at least one of them failed. Currently, the check of the return value of each test case is in the wrong place, which can result in the wrong return value. For example: # TESTS='tc_police' ./resource_scale.sh TEST: 'tc_police' [default] 968 [FAIL] tc police offload count failed Error: mlxsw_spectrum: Failed to allocate policer index. We have an error talking to the kernel Command failed /tmp/tmp.i7Oc5HwmXY:969 TEST: 'tc_police' [default] overflow 969 [ OK ] ... TEST: 'tc_police' [ipv4_max] overflow 969 [ OK ] $ echo $? 0 Fix this by moving the check to be done after each test case. Fixes: 059b18e21c63 ("selftests: mlxsw: Return correct error code in resource scale test") Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index bcb110e830ce..dea33dc93790 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -50,8 +50,8 @@ for current_test in ${TESTS:-$ALL_TESTS}; do else log_test "'$current_test' [$profile] overflow $target" fi + RET_FIN=$(( RET_FIN || RET )) done - RET_FIN=$(( RET_FIN || RET )) done done current_test="" -- cgit From 6c7273a266759d9d36f7c862149f248bcdeddc0f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 2 Mar 2022 09:59:27 -0800 Subject: ixgbe: xsk: change !netif_carrier_ok() handling in ixgbe_xmit_zc() Commit c685c69fba71 ("ixgbe: don't do any AF_XDP zero-copy transmit if netif is not OK") addressed the ring transient state when MEM_TYPE_XSK_BUFF_POOL was being configured which in turn caused the interface to through down/up. Maurice reported that when carrier is not ok and xsk_pool is present on ring pair, ksoftirqd will consume 100% CPU cycles due to the constant NAPI rescheduling as ixgbe_poll() states that there is still some work to be done. To fix this, do not set work_done to false for a !netif_carrier_ok(). Fixes: c685c69fba71 ("ixgbe: don't do any AF_XDP zero-copy transmit if netif is not OK") Reported-by: Maurice Baijens Tested-by: Maurice Baijens Signed-off-by: Maciej Fijalkowski Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index b3fd8e5cd85b..6a5e9cf6b5da 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -390,12 +390,14 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) u32 cmd_type; while (budget-- > 0) { - if (unlikely(!ixgbe_desc_unused(xdp_ring)) || - !netif_carrier_ok(xdp_ring->netdev)) { + if (unlikely(!ixgbe_desc_unused(xdp_ring))) { work_done = false; break; } + if (!netif_carrier_ok(xdp_ring->netdev)) + break; + if (!xsk_tx_peek_desc(pool, &desc)) break; -- cgit From e1bec7fa1cee311a6d3fb9161037c7675904134d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Mar 2022 17:42:49 +0200 Subject: net: dsa: make dsa_tree_change_tag_proto actually unwind the tag proto change The blamed commit said one thing but did another. It explains that we should restore the "return err" to the original "goto out_unwind_tagger", but instead it replaced it with "goto out_unlock". When DSA_NOTIFIER_TAG_PROTO fails after the first switch of a multi-switch tree, the switches would end up not using the same tagging protocol. Fixes: 0b0e2ff10356 ("net: dsa: restore error path of dsa_tree_change_tag_proto") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220303154249.1854436-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 209496996cdf..b4e67758e104 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1261,7 +1261,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, info.tag_ops = tag_ops; err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); if (err) - goto out_unlock; + goto out_unwind_tagger; err = dsa_tree_bind_tag_proto(dst, tag_ops); if (err) -- cgit From 2d3916f3189172d5c69d33065c3c21119fe539fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 09:37:28 -0800 Subject: ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report() While investigating on why a synchronize_net() has been added recently in ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report() might drop skbs in some cases. Discussion about removing synchronize_net() from ipv6_mc_down() will happen in a different thread. Fixes: f185de28d9ae ("mld: add new workqueues for process mld events") Signed-off-by: Eric Dumazet Cc: Taehee Yoo Cc: Cong Wang Cc: David Ahern Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ndisc.h | 4 ++-- net/ipv6/mcast.c | 32 ++++++++++++-------------------- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..47ffb360ddfa 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -475,9 +475,9 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a8861db52c18..909f937befd7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1371,27 +1371,23 @@ static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, } /* called with rcu_read_lock() */ -int igmp6_event_query(struct sk_buff *skb) +void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) @@ -1542,27 +1538,23 @@ static void mld_query_work(struct work_struct *work) } /* called with rcu_read_lock() */ -int igmp6_event_report(struct sk_buff *skb) +void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) -- cgit From b08968f196d498b19e9d0841d76a03862258f2d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Mar 2022 13:05:18 +0000 Subject: cachefiles: Fix incorrect length to fallocate() When cachefiles_shorten_object() calls fallocate() to shape the cache file to match the DIO size, it passes the total file size it wants to achieve, not the amount of zeros that should be inserted. Since this is meant to preallocate that amount of storage for the file, it can cause the cache to fill up the disk and hit ENOSPC. Fix this by passing the length actually required to go from the current EOF to the desired EOF. Fixes: 7623ed6772de ("cachefiles: Implement cookie resize for truncate") Reported-by: Jeffle Xu Signed-off-by: David Howells Tested-by: Jeff Layton Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164630854858.3665356.17419701804248490708.stgit@warthog.procyon.org.uk # v1 Signed-off-by: Linus Torvalds --- fs/cachefiles/interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 51c968cd00a6..ae93cee9d25d 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -254,7 +254,7 @@ static bool cachefiles_shorten_object(struct cachefiles_object *object, ret = cachefiles_inject_write_error(); if (ret == 0) ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE, - new_size, dio_size); + new_size, dio_size - new_size); if (ret < 0) { trace_cachefiles_io_error(object, file_inode(file), ret, cachefiles_trace_fallocate_error); -- cgit From 38f80f42147ff658aff218edb0a88c37e58bf44f Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Sat, 26 Feb 2022 14:40:56 +0700 Subject: MAINTAINERS: Remove dead patchwork link The patchwork link is dead. It says: 404: File not found The page URL requested (/project/LKML/list/) does not exist. Remove it. Signed-off-by: Ammar Faizi Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d2b39f444abf..05fd080b82f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21468,7 +21468,6 @@ THE REST M: Linus Torvalds L: linux-kernel@vger.kernel.org S: Buried alive in reporters -Q: http://patchwork.kernel.org/project/LKML/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git F: * F: */ -- cgit From 8b274f2238950c55570ff14fcc278a7fcbecc663 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:48 +0100 Subject: riscv: Fix is_linear_mapping with recent move of KASAN region The KASAN region was recently moved between the linear mapping and the kernel mapping, is_linear_mapping used to check the validity of an address by using the start of the kernel mapping, which is now wrong. Fix this by using the maximum size of the physical memory. Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 2 +- arch/riscv/include/asm/pgtable.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 160e3a1e8f8b..004372f8da54 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -119,7 +119,7 @@ extern phys_addr_t phys_ram_base; ((x) >= kernel_map.virt_addr && (x) < (kernel_map.virt_addr + kernel_map.size)) #define is_linear_mapping(x) \ - ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < kernel_map.virt_addr)) + ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < PAGE_OFFSET + KERN_VIRT_SIZE)) #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + kernel_map.va_pa_offset)) #define kernel_mapping_pa_to_va(y) ({ \ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7e949f25c933..e3549e50de95 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -13,6 +13,7 @@ #ifndef CONFIG_MMU #define KERNEL_LINK_ADDR PAGE_OFFSET +#define KERN_VIRT_SIZE (UL(-1)) #else #define ADDRESS_SPACE_END (UL(-1)) -- cgit From a3d328037846d013bb4c7f3777241e190e4c75e1 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:49 +0100 Subject: riscv: Fix config KASAN && SPARSEMEM && !SPARSE_VMEMMAP In order to get the pfn of a struct page* when sparsemem is enabled without vmemmap, the mem_section structures need to be initialized which happens in sparse_init. But kasan_early_init calls pfn_to_page way before sparse_init is called, which then tries to dereference a null mem_section pointer. Fix this by removing the usage of this function in kasan_early_init. Fixes: 8ad8b72721d0 ("riscv: Add KASAN support") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index f61f7ca6fe0f..85e849318389 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -202,8 +202,7 @@ asmlinkage void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PTE; ++i) set_pte(kasan_early_shadow_pte + i, - mk_pte(virt_to_page(kasan_early_shadow_page), - PAGE_KERNEL)); + pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL)); for (i = 0; i < PTRS_PER_PMD; ++i) set_pmd(kasan_early_shadow_pmd + i, -- cgit From 5f763b3b59602735993149330ffa7e348bc85bc0 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:50 +0100 Subject: riscv: Fix DEBUG_VIRTUAL false warnings KERN_VIRT_SIZE used to encompass the kernel mapping before it was redefined when moving the kasan mapping next to the kernel mapping to only match the maximum amount of physical memory. Then, kernel mapping addresses that go through __virt_to_phys are now declared as wrong which is not true, one can use __virt_to_phys on such addresses. Fix this by redefining the condition that matches wrong addresses. Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/physaddr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c index e7fd0c253c7b..19cf25a74ee2 100644 --- a/arch/riscv/mm/physaddr.c +++ b/arch/riscv/mm/physaddr.c @@ -8,12 +8,10 @@ phys_addr_t __virt_to_phys(unsigned long x) { - phys_addr_t y = x - PAGE_OFFSET; - /* * Boundary checking aginst the kernel linear mapping space. */ - WARN(y >= KERN_VIRT_SIZE, + WARN(!is_linear_mapping(x) && !is_kernel_mapping(x), "virt_to_phys used for non-linear address: %pK (%pS)\n", (void *)x, (void *)x); -- cgit From c648c4bb7d02ceb53ee40172fdc4433b37cee9c6 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:51 +0100 Subject: riscv: Fix config KASAN && DEBUG_VIRTUAL __virt_to_phys function is called very early in the boot process (ie kasan_early_init) so it should not be instrumented by KASAN otherwise it bugs. Fix this by declaring phys_addr.c as non-kasan instrumentable. Signed-off-by: Alexandre Ghiti Fixes: 8ad8b72721d0 (riscv: Add KASAN support) Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 7ebaef10ea1b..ac7a25298a04 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -24,6 +24,9 @@ obj-$(CONFIG_KASAN) += kasan_init.o ifdef CONFIG_KASAN KASAN_SANITIZE_kasan_init.o := n KASAN_SANITIZE_init.o := n +ifdef CONFIG_DEBUG_VIRTUAL +KASAN_SANITIZE_physaddr.o := n +endif endif obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o -- cgit From 625e24a550e6a600e639b43cf7c15879b2a70840 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:52 +0100 Subject: riscv: Move high_memory initialization to setup_bootmem high_memory used to be initialized in mem_init, way after setup_bootmem. But a call to dma_contiguous_reserve in this function gives rise to the below warning because high_memory is equal to 0 and is used at the very beginning at cma_declare_contiguous_nid. It went unnoticed since the move of the kasan region redefined KERN_VIRT_SIZE so that it does not encompass -1 anymore. Fix this by initializing high_memory in setup_bootmem. ------------[ cut here ]------------ virt_to_phys used for non-linear address: ffffffffffffffff (0xffffffffffffffff) WARNING: CPU: 0 PID: 0 at arch/riscv/mm/physaddr.c:14 __virt_to_phys+0xac/0x1b8 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.17.0-rc1-00007-ga68b89289e26 #27 Hardware name: riscv-virtio,qemu (DT) epc : __virt_to_phys+0xac/0x1b8 ra : __virt_to_phys+0xac/0x1b8 epc : ffffffff80014922 ra : ffffffff80014922 sp : ffffffff84a03c30 gp : ffffffff85866c80 tp : ffffffff84a3f180 t0 : ffffffff86bce657 t1 : fffffffef09406e8 t2 : 0000000000000000 s0 : ffffffff84a03c70 s1 : ffffffffffffffff a0 : 000000000000004f a1 : 00000000000f0000 a2 : 0000000000000002 a3 : ffffffff8011f408 a4 : 0000000000000000 a5 : 0000000000000000 a6 : 0000000000f00000 a7 : ffffffff84a03747 s2 : ffffffd800000000 s3 : ffffffff86ef4000 s4 : ffffffff8467f828 s5 : fffffff800000000 s6 : 8000000000006800 s7 : 0000000000000000 s8 : 0000000480000000 s9 : 0000000080038ea0 s10: 0000000000000000 s11: ffffffffffffffff t3 : ffffffff84a035c0 t4 : fffffffef09406e8 t5 : fffffffef09406e9 t6 : ffffffff84a03758 status: 0000000000000100 badaddr: 0000000000000000 cause: 0000000000000003 [] cma_declare_contiguous_nid+0xf2/0x64a [] dma_contiguous_reserve_area+0x46/0xb4 [] dma_contiguous_reserve+0x174/0x18e [] paging_init+0x12c/0x35e [] setup_arch+0x120/0x74e [] start_kernel+0xce/0x68c irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<0000000000000000>] 0x0 softirqs last enabled at (0): [<0000000000000000>] 0x0 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace 0000000000000000 ]--- Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c27294128e18..0d588032d6e6 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -125,7 +125,6 @@ void __init mem_init(void) else swiotlb_force = SWIOTLB_NO_FORCE; #endif - high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); memblock_free_all(); print_vm_layout(); @@ -195,6 +194,7 @@ static void __init setup_bootmem(void) min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); + high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); -- cgit From e4fcfe6eca6f32357f1b4408ff15b10527518eee Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:53 +0100 Subject: riscv: Fix kasan pud population In sv48, the kasan inner regions are not aligned on PGDIR_SIZE and then when we populate the kasan linear mapping region, we clear the kasan vmalloc region which is in the same PGD. Fix this by copying the content of the kasan early pud after allocating a new PGD for the first time. Fixes: e8a62cc26ddf ("riscv: Implement sv48 support") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 85e849318389..cd1a145257b7 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -113,8 +113,11 @@ static void __init kasan_populate_pud(pgd_t *pgd, base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); } else { base_pud = (pud_t *)pgd_page_vaddr(*pgd); - if (base_pud == lm_alias(kasan_early_shadow_pud)) + if (base_pud == lm_alias(kasan_early_shadow_pud)) { base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + memcpy(base_pud, (void *)kasan_early_shadow_pud, + sizeof(pud_t) * PTRS_PER_PUD); + } } pudp = base_pud + pud_index(vaddr); -- cgit From bfa26ba343c727e055223be04e08f2ebdd43c293 Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:23:42 -0800 Subject: HID: add mapping for KEY_DICTATE Numerous keyboards are adding dictate keys which allows for text messages to be dictated by a microphone. This patch adds a new key definition KEY_DICTATE and maps 0x0c/0x0d8 usage code to this new keycode. Additionally hid-debug is adjusted to recognize this new usage code as well. Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303021501.1.I5dbf50eb1a7a6734ee727bda4a8573358c6d3ec0@changeid Signed-off-by: Dmitry Torokhov --- drivers/hid/hid-debug.c | 1 + drivers/hid/hid-input.c | 1 + include/uapi/linux/input-event-codes.h | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 26c31d759914..8aa68416b1d7 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -969,6 +969,7 @@ static const char *keys[KEY_MAX + 1] = { [KEY_ASSISTANT] = "Assistant", [KEY_KBD_LAYOUT_NEXT] = "KbdLayoutNext", [KEY_EMOJI_PICKER] = "EmojiPicker", + [KEY_DICTATE] = "Dictate", [KEY_BRIGHTNESS_MIN] = "BrightnessMin", [KEY_BRIGHTNESS_MAX] = "BrightnessMax", [KEY_BRIGHTNESS_AUTO] = "BrightnessAuto", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 112901d2d8d2..ce2b75a67cb8 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -992,6 +992,7 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x0cd: map_key_clear(KEY_PLAYPAUSE); break; case 0x0cf: map_key_clear(KEY_VOICECOMMAND); break; + case 0x0d8: map_key_clear(KEY_DICTATE); break; case 0x0d9: map_key_clear(KEY_EMOJI_PICKER); break; case 0x0e0: map_abs_clear(ABS_VOLUME); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 225ec87d4f22..4db5d41848e4 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -612,6 +612,7 @@ #define KEY_ASSISTANT 0x247 /* AL Context-aware desktop assistant */ #define KEY_KBD_LAYOUT_NEXT 0x248 /* AC Next Keyboard Layout Select */ #define KEY_EMOJI_PICKER 0x249 /* Show/hide emoji picker (HUTRR101) */ +#define KEY_DICTATE 0x24a /* Start or Stop Voice Dictation Session (HUTRR99) */ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ -- cgit From 327b89f0acc4c20a06ed59e4d9af7f6d804dc2e2 Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:26:22 -0800 Subject: HID: add mapping for KEY_ALL_APPLICATIONS This patch adds a new key definition for KEY_ALL_APPLICATIONS and aliases KEY_DASHBOARD to it. It also maps the 0x0c/0x2a2 usage code to KEY_ALL_APPLICATIONS. Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303035618.1.I3a7746ad05d270161a18334ae06e3b6db1a1d339@changeid Signed-off-by: Dmitry Torokhov --- drivers/hid/hid-debug.c | 4 +++- drivers/hid/hid-input.c | 2 ++ include/uapi/linux/input-event-codes.h | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 8aa68416b1d7..81e7e404a5fc 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -860,7 +860,9 @@ static const char *keys[KEY_MAX + 1] = { [KEY_F22] = "F22", [KEY_F23] = "F23", [KEY_F24] = "F24", [KEY_PLAYCD] = "PlayCD", [KEY_PAUSECD] = "PauseCD", [KEY_PROG3] = "Prog3", - [KEY_PROG4] = "Prog4", [KEY_SUSPEND] = "Suspend", + [KEY_PROG4] = "Prog4", + [KEY_ALL_APPLICATIONS] = "AllApplications", + [KEY_SUSPEND] = "Suspend", [KEY_CLOSE] = "Close", [KEY_PLAY] = "Play", [KEY_FASTFORWARD] = "FastForward", [KEY_BASSBOOST] = "BassBoost", [KEY_PRINT] = "Print", [KEY_HP] = "HP", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index ce2b75a67cb8..56ec27398a00 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1084,6 +1084,8 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x29d: map_key_clear(KEY_KBD_LAYOUT_NEXT); break; + case 0x2a2: map_key_clear(KEY_ALL_APPLICATIONS); break; + case 0x2c7: map_key_clear(KEY_KBDINPUTASSIST_PREV); break; case 0x2c8: map_key_clear(KEY_KBDINPUTASSIST_NEXT); break; case 0x2c9: map_key_clear(KEY_KBDINPUTASSIST_PREVGROUP); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 4db5d41848e4..7989d9483ea7 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -278,7 +278,8 @@ #define KEY_PAUSECD 201 #define KEY_PROG3 202 #define KEY_PROG4 203 -#define KEY_DASHBOARD 204 /* AL Dashboard */ +#define KEY_ALL_APPLICATIONS 204 /* AC Desktop Show All Applications */ +#define KEY_DASHBOARD KEY_ALL_APPLICATIONS #define KEY_SUSPEND 205 #define KEY_CLOSE 206 /* AC Close */ #define KEY_PLAY 207 -- cgit From 74583f1b92cb3bbba1a3741cea237545c56f506c Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 1 Mar 2022 00:44:18 +0000 Subject: riscv: dts: k210: fix broken IRQs on hart1 Commit 67d96729a9e7 ("riscv: Update Canaan Kendryte K210 device tree") incorrectly removed two entries from the PLIC interrupt-controller node's interrupts-extended property. The PLIC driver cannot know the mapping between hart contexts and hart ids, so this information has to be provided by device tree, as specified by the PLIC device tree binding. The PLIC driver uses the interrupts-extended property, and initializes the hart context registers in the exact same order as provided by the interrupts-extended property. In other words, if we don't specify the S-mode interrupts, the PLIC driver will simply initialize the hart0 S-mode hart context with the hart1 M-mode configuration. It is therefore essential to specify the S-mode IRQs even though the system itself will only ever be running in M-mode. Re-add the S-mode interrupts, so that we get working IRQs on hart1 again. Cc: Fixes: 67d96729a9e7 ("riscv: Update Canaan Kendryte K210 device tree") Signed-off-by: Niklas Cassel Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/canaan/k210.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index 56f57118c633..44d338514761 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -113,7 +113,8 @@ compatible = "canaan,k210-plic", "sifive,plic-1.0.0"; reg = <0xC000000 0x4000000>; interrupt-controller; - interrupts-extended = <&cpu0_intc 11>, <&cpu1_intc 11>; + interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 9>, + <&cpu1_intc 11>, <&cpu1_intc 9>; riscv,ndev = <65>; }; -- cgit From 0bf476fc3624e3a72af4ba7340d430a91c18cd67 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 3 Mar 2022 12:10:27 -0600 Subject: net: macb: Fix lost RX packet wakeup race in NAPI receive There is an oddity in the way the RSR register flags propagate to the ISR register (and the actual interrupt output) on this hardware: it appears that RSR register bits only result in ISR being asserted if the interrupt was actually enabled at the time, so enabling interrupts with RSR bits already set doesn't trigger an interrupt to be raised. There was already a partial fix for this race in the macb_poll function where it checked for RSR bits being set and re-triggered NAPI receive. However, there was a still a race window between checking RSR and actually enabling interrupts, where a lost wakeup could happen. It's necessary to check again after enabling interrupts to see if RSR was set just prior to the interrupt being enabled, and re-trigger receive in that case. This issue was noticed in a point-to-point UDP request-response protocol which periodically saw timeouts or abnormally high response times due to received packets not being processed in a timely fashion. In many applications, more packets arriving, including TCP retransmissions, would cause the original packet to be processed, thus masking the issue. Fixes: 02f7a34f34e3 ("net: macb: Re-enable RX interrupt only when RX is done") Cc: stable@vger.kernel.org Co-developed-by: Scott McNutt Signed-off-by: Scott McNutt Signed-off-by: Robert Hancock Tested-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 98498a76ae16..d13f06cf0308 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1573,7 +1573,14 @@ static int macb_poll(struct napi_struct *napi, int budget) if (work_done < budget) { napi_complete_done(napi, work_done); - /* Packets received while interrupts were disabled */ + /* RSR bits only seem to propagate to raise interrupts when + * interrupts are enabled at the time, so if bits are already + * set due to packets received while interrupts were disabled, + * they will not cause another interrupt to be generated when + * interrupts are re-enabled. + * Check for this case here. This has been seen to happen + * around 30% of the time under heavy network load. + */ status = macb_readl(bp, RSR); if (status) { if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) @@ -1581,6 +1588,22 @@ static int macb_poll(struct napi_struct *napi, int budget) napi_reschedule(napi); } else { queue_writel(queue, IER, bp->rx_intr_mask); + + /* In rare cases, packets could have been received in + * the window between the check above and re-enabling + * interrupts. Therefore, a double-check is required + * to avoid losing a wakeup. This can potentially race + * with the interrupt handler doing the same actions + * if an interrupt is raised just after enabling them, + * but this should be harmless. + */ + status = macb_readl(bp, RSR); + if (unlikely(status)) { + queue_writel(queue, IDR, bp->rx_intr_mask); + if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) + queue_writel(queue, ISR, MACB_BIT(RCOMP)); + napi_schedule(napi); + } } } -- cgit From be4977b847f5d5cedb64d50eaaf2218c3a55a3a3 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Fri, 4 Mar 2022 03:25:18 +0000 Subject: tipc: fix kernel panic when enabling bearer When enabling a bearer on a node, a kernel panic is observed: [ 4.498085] RIP: 0010:tipc_mon_prep+0x4e/0x130 [tipc] ... [ 4.520030] Call Trace: [ 4.520689] [ 4.521236] tipc_link_build_proto_msg+0x375/0x750 [tipc] [ 4.522654] tipc_link_build_state_msg+0x48/0xc0 [tipc] [ 4.524034] __tipc_node_link_up+0xd7/0x290 [tipc] [ 4.525292] tipc_rcv+0x5da/0x730 [tipc] [ 4.526346] ? __netif_receive_skb_core+0xb7/0xfc0 [ 4.527601] tipc_l2_rcv_msg+0x5e/0x90 [tipc] [ 4.528737] __netif_receive_skb_list_core+0x20b/0x260 [ 4.530068] netif_receive_skb_list_internal+0x1bf/0x2e0 [ 4.531450] ? dev_gro_receive+0x4c2/0x680 [ 4.532512] napi_complete_done+0x6f/0x180 [ 4.533570] virtnet_poll+0x29c/0x42e [virtio_net] ... The node in question is receiving activate messages in another thread after changing bearer status to allow message sending/ receiving in current thread: thread 1 | thread 2 -------- | -------- | tipc_enable_bearer() | test_and_set_bit_lock() | tipc_bearer_xmit_skb() | | tipc_l2_rcv_msg() | tipc_rcv() | __tipc_node_link_up() | tipc_link_build_state_msg() | tipc_link_build_proto_msg() | tipc_mon_prep() | { | ... | // null-pointer dereference | u16 gen = mon->dom_gen; | ... | } // Not being executed yet | tipc_mon_create() | { | ... | // allocate | mon = kzalloc(); | ... | } | Monitoring pointer in thread 2 is dereferenced before monitoring data is allocated in thread 1. This causes kernel panic. This commit fixes it by allocating the monitoring data before enabling the bearer to receive messages. Fixes: 35c55c9877f8 ("tipc: add neighbor monitoring framework") Reported-by: Shuang Li Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/bearer.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 473a790f5894..a2f9c9640716 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -352,16 +352,18 @@ static int tipc_enable_bearer(struct net *net, const char *name, goto rejected; } - test_and_set_bit_lock(0, &b->up); - rcu_assign_pointer(tn->bearer_list[bearer_id], b); - if (skb) - tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); - + /* Create monitoring data before accepting activate messages */ if (tipc_mon_create(net, bearer_id)) { bearer_disable(net, b); + kfree_skb(skb); return -ENOMEM; } + test_and_set_bit_lock(0, &b->up); + rcu_assign_pointer(tn->bearer_list[bearer_id], b); + if (skb) + tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); + pr_info("Enabled bearer <%s>, priority %u\n", name, prio); return res; -- cgit From 838d6d3461db0fdbf33fc5f8a69c27b50b4a46da Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:56:15 -0500 Subject: virtio: unexport virtio_finalize_features virtio_finalize_features is only used internally within virtio. No reason to export it. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck Acked-by: Jason Wang --- drivers/virtio/virtio.c | 3 +-- include/linux/virtio.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 00ac9db792a4..d891b0a354b0 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -166,7 +166,7 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status) } EXPORT_SYMBOL_GPL(virtio_add_status); -int virtio_finalize_features(struct virtio_device *dev) +static int virtio_finalize_features(struct virtio_device *dev) { int ret = dev->config->finalize_features(dev); unsigned status; @@ -202,7 +202,6 @@ int virtio_finalize_features(struct virtio_device *dev) } return 0; } -EXPORT_SYMBOL_GPL(virtio_finalize_features); void virtio_reset_device(struct virtio_device *dev) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 72292a62cd90..5464f398912a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -133,7 +133,6 @@ bool is_virtio_device(struct device *dev); void virtio_break_device(struct virtio_device *dev); void virtio_config_changed(struct virtio_device *dev); -int virtio_finalize_features(struct virtio_device *dev); #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); -- cgit From 4fa59ede95195f267101a1b8916992cf3f245cdb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:58:41 -0500 Subject: virtio: acknowledge all features before access The feature negotiation was designed in a way that makes it possible for devices to know which config fields will be accessed by drivers. This is broken since commit 404123c2db79 ("virtio: allow drivers to validate features") with fallout in at least block and net. We have a partial work-around in commit 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") which at least lets devices find out which format should config space have, but this is a partial fix: guests should not access config space without acknowledging features since otherwise we'll never be able to change the config space format. To fix, split finalize_features from virtio_finalize_features and call finalize_features with all feature bits before validation, and then - if validation changed any bits - once again after. Since virtio_finalize_features no longer writes out features rename it to virtio_features_ok - since that is what it does: checks that features are ok with the device. As a side effect, this also reduces the amount of hypervisor accesses - we now only acknowledge features once unless we are clearing any features when validating (which is uncommon). IRC I think that this was more or less always the intent in the spec but unfortunately the way the spec is worded does not say this explicitly, I plan to address this at the spec level, too. Acked-by: Jason Wang Cc: stable@vger.kernel.org Fixes: 404123c2db79 ("virtio: allow drivers to validate features") Fixes: 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") Cc: "Halil Pasic" Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 39 ++++++++++++++++++++++----------------- include/linux/virtio_config.h | 3 ++- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index d891b0a354b0..d6396be0ea83 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -166,14 +166,13 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status) } EXPORT_SYMBOL_GPL(virtio_add_status); -static int virtio_finalize_features(struct virtio_device *dev) +/* Do some validation, then set FEATURES_OK */ +static int virtio_features_ok(struct virtio_device *dev) { - int ret = dev->config->finalize_features(dev); unsigned status; + int ret; might_sleep(); - if (ret) - return ret; ret = arch_has_restricted_virtio_memory_access(); if (ret) { @@ -244,17 +243,6 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } - /* - * Some devices detect legacy solely via F_VERSION_1. Write - * F_VERSION_1 to force LE config space accesses before FEATURES_OK for - * these when needed. - */ - if (drv->validate && !virtio_legacy_is_little_endian() - && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) { - dev->features = BIT_ULL(VIRTIO_F_VERSION_1); - dev->config->finalize_features(dev); - } - if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else @@ -265,13 +253,26 @@ static int virtio_dev_probe(struct device *_d) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); + err = dev->config->finalize_features(dev); + if (err) + goto err; + if (drv->validate) { + u64 features = dev->features; + err = drv->validate(dev); if (err) goto err; + + /* Did validation change any features? Then write them again. */ + if (features != dev->features) { + err = dev->config->finalize_features(dev); + if (err) + goto err; + } } - err = virtio_finalize_features(dev); + err = virtio_features_ok(dev); if (err) goto err; @@ -495,7 +496,11 @@ int virtio_device_restore(struct virtio_device *dev) /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); - ret = virtio_finalize_features(dev); + ret = dev->config->finalize_features(dev); + if (ret) + goto err; + + ret = virtio_features_ok(dev); if (ret) goto err; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 4d107ad31149..dafdc7f48c01 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -64,8 +64,9 @@ struct virtio_shm_region { * Returns the first 64 feature bits (all we currently need). * @finalize_features: confirm what device features we'll be using. * vdev: the virtio_device - * This gives the final feature bits for the device: it can change + * This sends the driver feature bits to the device: it can change * the dev->feature bits if it wants. + * Note: despite the name this can be called any number of times. * Returns 0 on success or error status * @bus_name: return the bus name associated with the device (optional) * vdev: the virtio_device -- cgit From c46eccdaadabb7822080a04e633f81b2ad37f358 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 15:54:01 -0500 Subject: virtio: document virtio_reset_device Looks like most callers get driver/device removal wrong. Document what's expected of callers. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index d6396be0ea83..22f15f444f75 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -202,6 +202,22 @@ static int virtio_features_ok(struct virtio_device *dev) return 0; } +/** + * virtio_reset_device - quiesce device for removal + * @dev: the device to reset + * + * Prevents device from sending interrupts and accessing memory. + * + * Generally used for cleanup during driver / device removal. + * + * Once this has been invoked, caller must ensure that + * virtqueue_notify / virtqueue_kick are not in progress. + * + * Note: this guarantees that vq callbacks are not in progress, however caller + * is responsible for preventing access from other contexts, such as a system + * call/workqueue/bh. Invoking virtio_break_device then flushing any such + * contexts is one way to handle that. + * */ void virtio_reset_device(struct virtio_device *dev) { dev->config->reset(dev); -- cgit From 0e7174b9d5877130fec41fb4a16e0c2ee4958d44 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Oct 2021 03:04:10 -0400 Subject: virtio_console: break out of buf poll on remove A common pattern for device reset is currently: vdev->config->reset(vdev); .. cleanup .. reset prevents new interrupts from arriving and waits for interrupt handlers to finish. However if - as is common - the handler queues a work request which is flushed during the cleanup stage, we have code adding buffers / trying to get buffers while device is reset. Not good. This was reproduced by running modprobe virtio_console modprobe -r virtio_console in a loop. Fix this up by calling virtio_break_device + flush before reset. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1786239 Signed-off-by: Michael S. Tsirkin --- drivers/char/virtio_console.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 2359889a35a0..e3c430539a17 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1957,6 +1957,13 @@ static void virtcons_remove(struct virtio_device *vdev) list_del(&portdev->list); spin_unlock_irq(&pdrvdata_lock); + /* Device is going away, exit any polling for buffers */ + virtio_break_device(vdev); + if (use_multiport(portdev)) + flush_work(&portdev->control_work); + else + flush_work(&portdev->config_work); + /* Disable interrupts for vqs */ virtio_reset_device(vdev); /* Finish up work that's lined up */ -- cgit From ca93e44bfb5fd7996b76f0f544999171f647f93b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Mar 2022 11:48:39 +0000 Subject: btrfs: fallback to blocking mode when doing async dio over multiple extents Some users recently reported that MariaDB was getting a read corruption when using io_uring on top of btrfs. This started to happen in 5.16, after commit 51bd9563b6783d ("btrfs: fix deadlock due to page faults during direct IO reads and writes"). That changed btrfs to use the new iomap flag IOMAP_DIO_PARTIAL and to disable page faults before calling iomap_dio_rw(). This was necessary to fix deadlocks when the iovector corresponds to a memory mapped file region. That type of scenario is exercised by test case generic/647 from fstests. For this MariaDB scenario, we attempt to read 16K from file offset X using IOCB_NOWAIT and io_uring. In that range we have 4 extents, each with a size of 4K, and what happens is the following: 1) btrfs_direct_read() disables page faults and calls iomap_dio_rw(); 2) iomap creates a struct iomap_dio object, its reference count is initialized to 1 and its ->size field is initialized to 0; 3) iomap calls btrfs_dio_iomap_begin() with file offset X, which finds the first 4K extent, and setups an iomap for this extent consisting of a single page; 4) At iomap_dio_bio_iter(), we are able to access the first page of the buffer (struct iov_iter) with bio_iov_iter_get_pages() without triggering a page fault; 5) iomap submits a bio for this 4K extent (iomap_dio_submit_bio() -> btrfs_submit_direct()) and increments the refcount on the struct iomap_dio object to 2; The ->size field of the struct iomap_dio object is incremented to 4K; 6) iomap calls btrfs_iomap_begin() again, this time with a file offset of X + 4K. There we setup an iomap for the next extent that also has a size of 4K; 7) Then at iomap_dio_bio_iter() we call bio_iov_iter_get_pages(), which tries to access the next page (2nd page) of the buffer. This triggers a page fault and returns -EFAULT; 8) At __iomap_dio_rw() we see the -EFAULT, but we reset the error to 0 because we passed the flag IOMAP_DIO_PARTIAL to iomap and the struct iomap_dio object has a ->size value of 4K (we submitted a bio for an extent already). The 'wait_for_completion' variable is not set to true, because our iocb has IOCB_NOWAIT set; 9) At the bottom of __iomap_dio_rw(), we decrement the reference count of the struct iomap_dio object from 2 to 1. Because we were not the only ones holding a reference on it and 'wait_for_completion' is set to false, -EIOCBQUEUED is returned to btrfs_direct_read(), which just returns it up the callchain, up to io_uring; 10) The bio submitted for the first extent (step 5) completes and its bio endio function, iomap_dio_bio_end_io(), decrements the last reference on the struct iomap_dio object, resulting in calling iomap_dio_complete_work() -> iomap_dio_complete(). 11) At iomap_dio_complete() we adjust the iocb->ki_pos from X to X + 4K and return 4K (the amount of io done) to iomap_dio_complete_work(); 12) iomap_dio_complete_work() calls the iocb completion callback, iocb->ki_complete() with a second argument value of 4K (total io done) and the iocb with the adjust ki_pos of X + 4K. This results in completing the read request for io_uring, leaving it with a result of 4K bytes read, and only the first page of the buffer filled in, while the remaining 3 pages, corresponding to the other 3 extents, were not filled; 13) For the application, the result is unexpected because if we ask to read N bytes, it expects to get N bytes read as long as those N bytes don't cross the EOF (i_size). MariaDB reports this as an error, as it's not expecting a short read, since it knows it's asking for read operations fully within the i_size boundary. This is typical in many applications, but it may also be questionable if they should react to such short reads by issuing more read calls to get the remaining data. Nevertheless, the short read happened due to a change in btrfs regarding how it deals with page faults while in the middle of a read operation, and there's no reason why btrfs can't have the previous behaviour of returning the whole data that was requested by the application. The problem can also be triggered with the following simple program: /* Get O_DIRECT */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include int main(int argc, char *argv[]) { char *foo_path; struct io_uring ring; struct io_uring_sqe *sqe; struct io_uring_cqe *cqe; struct iovec iovec; int fd; long pagesize; void *write_buf; void *read_buf; ssize_t ret; int i; if (argc != 2) { fprintf(stderr, "Use: %s \n", argv[0]); return 1; } foo_path = malloc(strlen(argv[1]) + 5); if (!foo_path) { fprintf(stderr, "Failed to allocate memory for file path\n"); return 1; } strcpy(foo_path, argv[1]); strcat(foo_path, "/foo"); /* * Create file foo with 2 extents, each with a size matching * the page size. Then allocate a buffer to read both extents * with io_uring, using O_DIRECT and IOCB_NOWAIT. Before doing * the read with io_uring, access the first page of the buffer * to fault it in, so that during the read we only trigger a * page fault when accessing the second page of the buffer. */ fd = open(foo_path, O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, 0666); if (fd == -1) { fprintf(stderr, "Failed to create file 'foo': %s (errno %d)", strerror(errno), errno); return 1; } pagesize = sysconf(_SC_PAGE_SIZE); ret = posix_memalign(&write_buf, pagesize, 2 * pagesize); if (ret) { fprintf(stderr, "Failed to allocate write buffer\n"); return 1; } memset(write_buf, 0xab, pagesize); memset(write_buf + pagesize, 0xcd, pagesize); /* Create 2 extents, each with a size matching page size. */ for (i = 0; i < 2; i++) { ret = pwrite(fd, write_buf + i * pagesize, pagesize, i * pagesize); if (ret != pagesize) { fprintf(stderr, "Failed to write to file, ret = %ld errno %d (%s)\n", ret, errno, strerror(errno)); return 1; } ret = fsync(fd); if (ret != 0) { fprintf(stderr, "Failed to fsync file\n"); return 1; } } close(fd); fd = open(foo_path, O_RDONLY | O_DIRECT); if (fd == -1) { fprintf(stderr, "Failed to open file 'foo': %s (errno %d)", strerror(errno), errno); return 1; } ret = posix_memalign(&read_buf, pagesize, 2 * pagesize); if (ret) { fprintf(stderr, "Failed to allocate read buffer\n"); return 1; } /* * Fault in only the first page of the read buffer. * We want to trigger a page fault for the 2nd page of the * read buffer during the read operation with io_uring * (O_DIRECT and IOCB_NOWAIT). */ memset(read_buf, 0, 1); ret = io_uring_queue_init(1, &ring, 0); if (ret != 0) { fprintf(stderr, "Failed to create io_uring queue\n"); return 1; } sqe = io_uring_get_sqe(&ring); if (!sqe) { fprintf(stderr, "Failed to get io_uring sqe\n"); return 1; } iovec.iov_base = read_buf; iovec.iov_len = 2 * pagesize; io_uring_prep_readv(sqe, fd, &iovec, 1, 0); ret = io_uring_submit_and_wait(&ring, 1); if (ret != 1) { fprintf(stderr, "Failed at io_uring_submit_and_wait()\n"); return 1; } ret = io_uring_wait_cqe(&ring, &cqe); if (ret < 0) { fprintf(stderr, "Failed at io_uring_wait_cqe()\n"); return 1; } printf("io_uring read result for file foo:\n\n"); printf(" cqe->res == %d (expected %d)\n", cqe->res, 2 * pagesize); printf(" memcmp(read_buf, write_buf) == %d (expected 0)\n", memcmp(read_buf, write_buf, 2 * pagesize)); io_uring_cqe_seen(&ring, cqe); io_uring_queue_exit(&ring); return 0; } When running it on an unpatched kernel: $ gcc io_uring_test.c -luring $ mkfs.btrfs -f /dev/sda $ mount /dev/sda /mnt/sda $ ./a.out /mnt/sda io_uring read result for file foo: cqe->res == 4096 (expected 8192) memcmp(read_buf, write_buf) == -205 (expected 0) After this patch, the read always returns 8192 bytes, with the buffer filled with the correct data. Although that reproducer always triggers the bug in my test vms, it's possible that it will not be so reliable on other environments, as that can happen if the bio for the first extent completes and decrements the reference on the struct iomap_dio object before we do the atomic_dec_and_test() on the reference at __iomap_dio_rw(). Fix this in btrfs by having btrfs_dio_iomap_begin() return -EAGAIN whenever we try to satisfy a non blocking IO request (IOMAP_NOWAIT flag set) over a range that spans multiple extents (or a mix of extents and holes). This avoids returning success to the caller when we only did partial IO, which is not optimal for writes and for reads it's actually incorrect, as the caller doesn't expect to get less bytes read than it has requested (unless EOF is crossed), as previously mentioned. This is also the type of behaviour that xfs follows (xfs_direct_write_iomap_begin()), even though it doesn't use IOMAP_DIO_PARTIAL. A test case for fstests will follow soon. Link: https://lore.kernel.org/linux-btrfs/CABVffEM0eEWho+206m470rtM0d9J8ue85TtR-A_oVTuGLWFicA@mail.gmail.com/ Link: https://lore.kernel.org/linux-btrfs/CAHF2GV6U32gmqSjLe=XKgfcZAmLCiH26cJ2OnHGp5x=VAH4OHQ@mail.gmail.com/ CC: stable@vger.kernel.org # 5.16+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76e530f76e3c..5bbea5ec31fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7600,6 +7600,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, start, len); -- cgit From e0077cc13b831f8fad5557442f73bf7728683713 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Fri, 14 Jan 2022 19:27:59 -0500 Subject: vdpa: factor out vdpa_set_features_unlocked for vdpa internal use No functional change introduced. vdpa bus driver such as virtio_vdpa or vhost_vdpa is not supposed to take care of the locking for core by its own. The locked API vdpa_set_features should suffice the bus driver's need. Signed-off-by: Si-Wei Liu Reviewed-by: Eli Cohen Link: https://lore.kernel.org/r/1642206481-30721-2-git-send-email-si-wei.liu@oracle.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa.c | 2 +- drivers/vhost/vdpa.c | 2 +- drivers/virtio/virtio_vdpa.c | 2 +- include/linux/vdpa.h | 18 ++++++++++++------ 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 9846c9de4bfa..1ea525433a5c 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -393,7 +393,7 @@ static void vdpa_get_config_unlocked(struct vdpa_device *vdev, * If it does happen we assume a legacy guest. */ if (!vdev->features_valid) - vdpa_set_features(vdev, 0, true); + vdpa_set_features_unlocked(vdev, 0); ops->get_config(vdev, offset, buf, len); } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 851539807bc9..ec5249e8c32d 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -286,7 +286,7 @@ static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; - if (vdpa_set_features(vdpa, features, false)) + if (vdpa_set_features(vdpa, features)) return -EINVAL; return 0; diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 7767a7f0119b..76504559bc25 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -317,7 +317,7 @@ static int virtio_vdpa_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - return vdpa_set_features(vdpa, vdev->features, false); + return vdpa_set_features(vdpa, vdev->features); } static const char *virtio_vdpa_bus_name(struct virtio_device *vdev) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2de442ececae..721089bb4c84 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -401,18 +401,24 @@ static inline int vdpa_reset(struct vdpa_device *vdev) return ret; } -static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features, bool locked) +static inline int vdpa_set_features_unlocked(struct vdpa_device *vdev, u64 features) { const struct vdpa_config_ops *ops = vdev->config; int ret; - if (!locked) - mutex_lock(&vdev->cf_mutex); - vdev->features_valid = true; ret = ops->set_driver_features(vdev, features); - if (!locked) - mutex_unlock(&vdev->cf_mutex); + + return ret; +} + +static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) +{ + int ret; + + mutex_lock(&vdev->cf_mutex); + ret = vdpa_set_features_unlocked(vdev, features); + mutex_unlock(&vdev->cf_mutex); return ret; } -- cgit From 30c22f3816ffef8aa21a000e93c4ee1402a6ea65 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Fri, 14 Jan 2022 19:28:00 -0500 Subject: vdpa/mlx5: should verify CTRL_VQ feature exists for MQ Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ". There's assumption in the mlx5_vdpa multiqueue code that MQ must come together with CTRL_VQ. However, there's nowhere in the upper layer to guarantee this assumption would hold. Were there an untrusted driver sending down MQ without CTRL_VQ, it would compromise various spots for e.g. is_index_valid() and is_ctrl_vq_idx(). Although this doesn't end up with immediate panic or security loophole as of today's code, the chance for this to be taken advantage of due to future code change is not zero. Harden the crispy assumption by failing the set_driver_features() call when seeing (MQ && !CTRL_VQ). For that end, verify_min_features() is renamed to verify_driver_features() to reflect the fact that it now does more than just validate the minimum features. verify_driver_features() is now used to accommodate various checks against the driver features for set_driver_features(). Signed-off-by: Si-Wei Liu Link: https://lore.kernel.org/r/1642206481-30721-3-git-send-email-si-wei.liu@oracle.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index f648f1c54a0f..1ffbd20fcf21 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1897,11 +1897,25 @@ static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev) return ndev->mvdev.mlx_features; } -static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features) +static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features) { + /* Minimum features to expect */ if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) return -EOPNOTSUPP; + /* Double check features combination sent down by the driver. + * Fail invalid features due to absence of the depended feature. + * + * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit + * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ". + * By failing the invalid features sent down by untrusted drivers, + * we're assured the assumption made upon is_index_valid() and + * is_ctrl_vq_idx() will not be compromised. + */ + if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) == + BIT_ULL(VIRTIO_NET_F_MQ)) + return -EINVAL; + return 0; } @@ -1977,7 +1991,7 @@ static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features) print_features(mvdev, features, true); - err = verify_min_features(mvdev, features); + err = verify_driver_features(mvdev, features); if (err) return err; -- cgit From ed0f849fc3a63ed2ddf5e72cdb1de3bdbbb0f8eb Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Fri, 14 Jan 2022 19:28:01 -0500 Subject: vdpa/mlx5: add validation for VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET command When control vq receives a VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET command request from the driver, presently there is no validation against the number of queue pairs to configure, or even if multiqueue had been negotiated or not is unverified. This may lead to kernel panic due to uninitialized resource for the queues were there any bogus request sent down by untrusted driver. Tie up the loose ends there. Fixes: 52893733f2c5 ("vdpa/mlx5: Add multiqueue support") Signed-off-by: Si-Wei Liu Link: https://lore.kernel.org/r/1642206481-30721-4-git-send-email-si-wei.liu@oracle.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 1ffbd20fcf21..d0f91078600e 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1563,11 +1563,27 @@ static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd) switch (cmd) { case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET: + /* This mq feature check aligns with pre-existing userspace + * implementation. + * + * Without it, an untrusted driver could fake a multiqueue config + * request down to a non-mq device that may cause kernel to + * panic due to uninitialized resources for extra vqs. Even with + * a well behaving guest driver, it is not expected to allow + * changing the number of vqs on a non-mq device. + */ + if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) + break; + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq)); if (read != sizeof(mq)) break; newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs); + if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || + newqps > mlx5_vdpa_max_qps(mvdev->max_vqs)) + break; + if (ndev->cur_num_vqs == 2 * newqps) { status = VIRTIO_NET_OK; break; -- cgit From b9d102dafec6af1c07b610faf0a6d4e8aee14ae0 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 21 Jan 2022 16:39:39 +0800 Subject: vduse: Fix returning wrong type in vduse_domain_alloc_iova() This fixes the following smatch warnings: drivers/vdpa/vdpa_user/iova_domain.c:305 vduse_domain_alloc_iova() warn: should 'iova_pfn << shift' be a 64 bit type? Fixes: 8c773d53fb7b ("vduse: Implement an MMU-based software IOTLB") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20220121083940.102-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_user/iova_domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 2b1143f11d8f..0a4d93edc4c0 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -294,7 +294,7 @@ vduse_domain_alloc_iova(struct iova_domain *iovad, iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true); - return iova_pfn << shift; + return (dma_addr_t)iova_pfn << shift; } static void vduse_domain_free_iova(struct iova_domain *iovad, -- cgit From 0708a0afe291bdfe1386d74d5ec1f0c27e8b9168 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2022 15:26:32 +0100 Subject: mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller was recently triggering an oversized kvmalloc() warning via xdp_umem_create(). The triggered warning was added back in 7661809d493b ("mm: don't allow oversized kvmalloc() calls"). The rationale for the warning for huge kvmalloc sizes was as a reaction to a security bug where the size was more than UINT_MAX but not everything was prepared to handle unsigned long sizes. Anyway, the AF_XDP related call trace from this syzkaller report was: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] xdp_umem_pin_pages net/xdp/xdp_umem.c:102 [inline] xdp_umem_reg net/xdp/xdp_umem.c:219 [inline] xdp_umem_create+0x6a5/0xf00 net/xdp/xdp_umem.c:252 xsk_setsockopt+0x604/0x790 net/xdp/xsk.c:1068 __sys_setsockopt+0x1fd/0x4e0 net/socket.c:2176 __do_sys_setsockopt net/socket.c:2187 [inline] __se_sys_setsockopt net/socket.c:2184 [inline] __x64_sys_setsockopt+0xb5/0x150 net/socket.c:2184 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Björn mentioned that requests for >2GB allocation can still be valid: The structure that is being allocated is the page-pinning accounting. AF_XDP has an internal limit of U32_MAX pages, which is *a lot*, but still fewer than what memcg allows (PAGE_COUNTER_MAX is a LONG_MAX/ PAGE_SIZE on 64 bit systems). [...] I could just change from U32_MAX to INT_MAX, but as I stated earlier that has a hacky feeling to it. [...] From my perspective, the code isn't broken, with the memcg limits in consideration. [...] Linus says: [...] Pretty much every time this has come up, the kernel warning has shown that yes, the code was broken and there really wasn't a reason for doing allocations that big. Of course, some people would be perfectly fine with the allocation failing, they just don't want the warning. I didn't want __GFP_NOWARN to shut it up originally because I wanted people to see all those cases, but these days I think we can just say "yeah, people can shut it up explicitly by saying 'go ahead and fail this allocation, don't warn about it'". So enough time has passed that by now I'd certainly be ok with [it]. Thus allow call-sites to silence such userspace triggered splats if the allocation requests have __GFP_NOWARN. For xdp_umem_pin_pages()'s call to kvcalloc() this is already the case, so nothing else needed there. Fixes: 7661809d493b ("mm: don't allow oversized kvmalloc() calls") Reported-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com Suggested-by: Linus Torvalds Signed-off-by: Daniel Borkmann Tested-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com Cc: Björn Töpel Cc: Magnus Karlsson Cc: Willy Tarreau Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Jakub Kicinski Cc: David S. Miller Link: https://lore.kernel.org/bpf/CAJ+HfNhyfsT5cS_U9EC213ducHs9k9zNxX9+abqC0kTrPbQ0gg@mail.gmail.com Link: https://lore.kernel.org/bpf/20211201202905.b9892171e3f5b9a60f9da251@linux-foundation.org Reviewed-by: Leon Romanovsky Ackd-by: Michal Hocko Signed-off-by: Linus Torvalds --- mm/util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index 7e43369064c8..d3102081add0 100644 --- a/mm/util.c +++ b/mm/util.c @@ -587,8 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) return ret; /* Don't even allow crazy sizes */ - if (WARN_ON_ONCE(size > INT_MAX)) + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; + } return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); -- cgit From 1d02b444b8d1345ea4708db3bab4db89a7784b55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 2 Mar 2022 19:17:44 -0800 Subject: tracing: Fix return value of __setup handlers __setup() handlers should generally return 1 to indicate that the boot options have been handled. Using invalid option values causes the entire kernel boot option string to be reported as Unknown and added to init's environment strings, polluting it. Unknown kernel command line parameters "BOOT_IMAGE=/boot/bzImage-517rc6 kprobe_event=p,syscall_any,$arg1 trace_options=quiet trace_clock=jiffies", will be passed to user space. Run /sbin/init as init process with arguments: /sbin/init with environment: HOME=/ TERM=linux BOOT_IMAGE=/boot/bzImage-517rc6 kprobe_event=p,syscall_any,$arg1 trace_options=quiet trace_clock=jiffies Return 1 from the __setup() handlers so that init's environment is not polluted with kernel boot options. Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Link: https://lkml.kernel.org/r/20220303031744.32356-1-rdunlap@infradead.org Cc: stable@vger.kernel.org Fixes: 7bcfaf54f591 ("tracing: Add trace_options kernel command line parameter") Fixes: e1e232ca6b8f ("tracing: Add trace_clock= kernel parameter") Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3050892d1812..eb44418574f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,7 +235,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - return 0; + return 1; } __setup("trace_options=", set_trace_boot_options); @@ -246,7 +246,7 @@ static int __init set_trace_boot_clock(char *str) { strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; - return 0; + return 1; } __setup("trace_clock=", set_trace_boot_clock); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 508f14af4f2c..b62fd785b599 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -32,7 +32,7 @@ static int __init set_kprobe_boot_events(char *str) strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); - return 0; + return 1; } __setup("kprobe_event=", set_kprobe_boot_events); -- cgit From a502a8f04097e038c3daa16c5202a9538116d563 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 3 Mar 2022 08:54:15 +0100 Subject: net: phy: meson-gxl: fix interrupt handling in forced mode This PHY doesn't support a link-up interrupt source. If aneg is enabled we use the "aneg complete" interrupt for this purpose, but if aneg is disabled link-up isn't signaled currently. According to a vendor driver there's an additional "energy detect" interrupt source that can be used to signal link-up if aneg is disabled. We can safely ignore this interrupt source if aneg is enabled. This patch was tested on a TX3 Mini TV box with S905W (even though boot message says it's a S905D). This issue has been existing longer, but due to changes in phylib and the driver the patch applies only from the commit marked as fixed. Fixes: 84c8f773d2dc ("net: phy: meson-gxl: remove the use of .ack_callback()") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/04cac530-ea1b-850e-6cfa-144a55c4d75d@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/meson-gxl.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index 7e7904fee1d9..c49062ad72c6 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -30,8 +30,12 @@ #define INTSRC_LINK_DOWN BIT(4) #define INTSRC_REMOTE_FAULT BIT(5) #define INTSRC_ANEG_COMPLETE BIT(6) +#define INTSRC_ENERGY_DETECT BIT(7) #define INTSRC_MASK 30 +#define INT_SOURCES (INTSRC_LINK_DOWN | INTSRC_ANEG_COMPLETE | \ + INTSRC_ENERGY_DETECT) + #define BANK_ANALOG_DSP 0 #define BANK_WOL 1 #define BANK_BIST 3 @@ -200,7 +204,6 @@ static int meson_gxl_ack_interrupt(struct phy_device *phydev) static int meson_gxl_config_intr(struct phy_device *phydev) { - u16 val; int ret; if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { @@ -209,16 +212,9 @@ static int meson_gxl_config_intr(struct phy_device *phydev) if (ret) return ret; - val = INTSRC_ANEG_PR - | INTSRC_PARALLEL_FAULT - | INTSRC_ANEG_LP_ACK - | INTSRC_LINK_DOWN - | INTSRC_REMOTE_FAULT - | INTSRC_ANEG_COMPLETE; - ret = phy_write(phydev, INTSRC_MASK, val); + ret = phy_write(phydev, INTSRC_MASK, INT_SOURCES); } else { - val = 0; - ret = phy_write(phydev, INTSRC_MASK, val); + ret = phy_write(phydev, INTSRC_MASK, 0); /* Ack any pending IRQ */ ret = meson_gxl_ack_interrupt(phydev); @@ -237,9 +233,16 @@ static irqreturn_t meson_gxl_handle_interrupt(struct phy_device *phydev) return IRQ_NONE; } + irq_status &= INT_SOURCES; + if (irq_status == 0) return IRQ_NONE; + /* Aneg-complete interrupt is used for link-up detection */ + if (phydev->autoneg == AUTONEG_ENABLE && + irq_status == INTSRC_ENERGY_DETECT) + return IRQ_HANDLED; + phy_trigger_machine(phydev); return IRQ_HANDLED; -- cgit From eafd987d4a82c7bb5aa12f0e3b4f8f3dea93e678 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 25 Feb 2022 14:31:49 -0800 Subject: x86/speculation: Warn about Spectre v2 LFENCE mitigation With: f8a66d608a3e ("x86,bugs: Unconditionally allow spectre_v2=retpoline,amd") it became possible to enable the LFENCE "retpoline" on Intel. However, Intel doesn't recommend it, as it has some weaknesses compared to retpoline. Now AMD doesn't recommend it either. It can still be left available as a cmdline option. It's faster than retpoline but is weaker in certain scenarios -- particularly SMT, but even non-SMT may be vulnerable in some cases. So just unconditionally warn if the user requests it on the cmdline. [ bp: Massage commit message. ] Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/bugs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a36bfe2c2480..cfd116423908 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -651,6 +651,7 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #ifdef CONFIG_BPF_SYSCALL @@ -972,6 +973,7 @@ static void __init spectre_v2_select_mitigation(void) break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); mode = SPECTRE_V2_LFENCE; break; @@ -1787,6 +1789,9 @@ static char *ibpb_state(void) static ssize_t spectre_v2_show_state(char *buf) { + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); -- cgit From 0de05d056afdb00eca8c7bbb0c79a3438daf700c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 25 Feb 2022 14:32:28 -0800 Subject: x86/speculation: Warn about eIBRS + LFENCE + Unprivileged eBPF + SMT The commit 44a3918c8245 ("x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting") added a warning for the "eIBRS + unprivileged eBPF" combination, which has been shown to be vulnerable against Spectre v2 BHB-based attacks. However, there's no warning about the "eIBRS + LFENCE retpoline + unprivileged eBPF" combo. The LFENCE adds more protection by shortening the speculation window after a mispredicted branch. That makes an attack significantly more difficult, even with unprivileged eBPF. So at least for now the logic doesn't warn about that combination. But if you then add SMT into the mix, the SMT attack angle weakens the effectiveness of the LFENCE considerably. So extend the "eIBRS + unprivileged eBPF" warning to also include the "eIBRS + LFENCE + unprivileged eBPF + SMT" case. [ bp: Massage commit message. ] Suggested-by: Alyssa Milburn Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/bugs.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cfd116423908..6296e1ebed1d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -653,12 +653,27 @@ static inline const char *spectre_v2_module_string(void) { return ""; } #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) { - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } } #endif @@ -1118,6 +1133,10 @@ void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; @@ -1793,7 +1812,11 @@ static ssize_t spectre_v2_show_state(char *buf) return sprintf(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) - return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], -- cgit From 58dbe9b373df2828d873b1c0e5afc77485b2f376 Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Tue, 1 Mar 2022 17:47:43 -0300 Subject: powerpc/64s: Fix build failure when CONFIG_PPC_64S_HASH_MMU is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following build failure occurs when CONFIG_PPC_64S_HASH_MMU is not set: arch/powerpc/kernel/setup_64.c: In function ‘setup_per_cpu_areas’: arch/powerpc/kernel/setup_64.c:811:21: error: ‘mmu_linear_psize’ undeclared (first use in this function); did you mean ‘mmu_virtual_psize’? 811 | if (mmu_linear_psize == MMU_PAGE_4K) | ^~~~~~~~~~~~~~~~ | mmu_virtual_psize arch/powerpc/kernel/setup_64.c:811:21: note: each undeclared identifier is reported only once for each function it appears in Move the declaration of mmu_linear_psize outside of CONFIG_PPC_64S_HASH_MMU ifdef. After the above is fixed, it fails later with the following error: ld: arch/powerpc/kexec/file_load_64.o: in function `.arch_kexec_kernel_image_probe': file_load_64.c:(.text+0x1c1c): undefined reference to `.add_htab_mem_range' Fix that, too, by conditioning add_htab_mem_range() symbol to CONFIG_PPC_64S_HASH_MMU. Fixes: 387e220a2e5e ("powerpc/64s: Move hash MMU support code under CONFIG_PPC_64S_HASH_MMU") Reported-by: Erhard F. Signed-off-by: Murilo Opsfelder Araujo Signed-off-by: Michael Ellerman BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215567 Link: https://lore.kernel.org/r/20220301204743.45133-1-muriloo@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/include/asm/kexec_ranges.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index ba5b1becf518..006cbec70ffe 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -202,7 +202,6 @@ static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) /* * The current system page and segment sizes */ -extern int mmu_linear_psize; extern int mmu_virtual_psize; extern int mmu_vmalloc_psize; extern int mmu_io_psize; @@ -213,6 +212,7 @@ extern int mmu_io_psize; #define mmu_virtual_psize MMU_PAGE_4K #endif #endif +extern int mmu_linear_psize; extern int mmu_vmemmap_psize; /* MMU initialization */ diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index 7a90000f8d15..f83866a19e87 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -9,7 +9,7 @@ struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges); int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); int add_tce_mem_ranges(struct crash_mem **mem_ranges); int add_initrd_mem_range(struct crash_mem **mem_ranges); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU int add_htab_mem_range(struct crash_mem **mem_ranges); #else static inline int add_htab_mem_range(struct crash_mem **mem_ranges) -- cgit From 9dd78194a3722fa6712192cdd4f7032d45112a9a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Feb 2022 16:45:54 +0000 Subject: ARM: report Spectre v2 status through sysfs As per other architectures, add support for reporting the Spectre vulnerability status via sysfs CPU. Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/spectre.h | 28 ++++++++ arch/arm/kernel/Makefile | 2 + arch/arm/kernel/spectre.c | 54 ++++++++++++++++ arch/arm/mm/Kconfig | 1 + arch/arm/mm/proc-v7-bugs.c | 141 +++++++++++++++++++++++++++++------------ 5 files changed, 187 insertions(+), 39 deletions(-) create mode 100644 arch/arm/include/asm/spectre.h create mode 100644 arch/arm/kernel/spectre.c diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h new file mode 100644 index 000000000000..8a9019e08dba --- /dev/null +++ b/arch/arm/include/asm/spectre.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_SPECTRE_H +#define __ASM_SPECTRE_H + +enum { + SPECTRE_UNAFFECTED, + SPECTRE_MITIGATED, + SPECTRE_VULNERABLE, +}; + +enum { + __SPECTRE_V2_METHOD_BPIALL, + __SPECTRE_V2_METHOD_ICIALLU, + __SPECTRE_V2_METHOD_SMC, + __SPECTRE_V2_METHOD_HVC, +}; + +enum { + SPECTRE_V2_METHOD_BPIALL = BIT(__SPECTRE_V2_METHOD_BPIALL), + SPECTRE_V2_METHOD_ICIALLU = BIT(__SPECTRE_V2_METHOD_ICIALLU), + SPECTRE_V2_METHOD_SMC = BIT(__SPECTRE_V2_METHOD_SMC), + SPECTRE_V2_METHOD_HVC = BIT(__SPECTRE_V2_METHOD_HVC), +}; + +void spectre_v2_update_state(unsigned int state, unsigned int methods); + +#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index ae295a3bcfef..6ef3b535b7bf 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -106,4 +106,6 @@ endif obj-$(CONFIG_HAVE_ARM_SMCCC) += smccc-call.o +obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += spectre.o + extra-y := $(head-y) vmlinux.lds diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c new file mode 100644 index 000000000000..6f6dd1cfd099 --- /dev/null +++ b/arch/arm/kernel/spectre.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +static unsigned int spectre_v2_state; +static unsigned int spectre_v2_methods; + +void spectre_v2_update_state(unsigned int state, unsigned int method) +{ + if (state > spectre_v2_state) + spectre_v2_state = state; + spectre_v2_methods |= method; +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + const char *method; + + if (spectre_v2_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "%s\n", "Not affected"); + + if (spectre_v2_state != SPECTRE_MITIGATED) + return sprintf(buf, "%s\n", "Vulnerable"); + + switch (spectre_v2_methods) { + case SPECTRE_V2_METHOD_BPIALL: + method = "Branch predictor hardening"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + method = "I-cache invalidation"; + break; + + case SPECTRE_V2_METHOD_SMC: + case SPECTRE_V2_METHOD_HVC: + method = "Firmware call"; + break; + + default: + method = "Multiple mitigations"; + break; + } + + return sprintf(buf, "Mitigation: %s\n", method); +} diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 58afba346729..850329ea9bf8 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -830,6 +830,7 @@ config CPU_BPREDICT_DISABLE config CPU_SPECTRE bool + select GENERIC_CPU_VULNERABILITIES config HARDEN_BRANCH_PREDICTOR bool "Harden the branch predictor against aliasing attacks" if EXPERT diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index 114c05ab4dd9..e438e59bb63e 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -6,8 +6,35 @@ #include #include #include +#include #include +#ifdef CONFIG_ARM_PSCI +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + switch ((int)res.a0) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + + default: + return SPECTRE_VULNERABLE; + } +} +#else +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + return SPECTRE_VULNERABLE; +} +#endif + #ifdef CONFIG_HARDEN_BRANCH_PREDICTOR DEFINE_PER_CPU(harden_branch_predictor_fn_t, harden_branch_predictor_fn); @@ -36,13 +63,60 @@ static void __maybe_unused call_hvc_arch_workaround_1(void) arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); } -static void cpu_v7_spectre_init(void) +static unsigned int spectre_v2_install_workaround(unsigned int method) { const char *spectre_v2_method = NULL; int cpu = smp_processor_id(); if (per_cpu(harden_branch_predictor_fn, cpu)) - return; + return SPECTRE_MITIGATED; + + switch (method) { + case SPECTRE_V2_METHOD_BPIALL: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_bpiall; + spectre_v2_method = "BPIALL"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_iciallu; + spectre_v2_method = "ICIALLU"; + break; + + case SPECTRE_V2_METHOD_HVC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_hvc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_hvc_switch_mm; + spectre_v2_method = "hypervisor"; + break; + + case SPECTRE_V2_METHOD_SMC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_smc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_smc_switch_mm; + spectre_v2_method = "firmware"; + break; + } + + if (spectre_v2_method) + pr_info("CPU%u: Spectre v2: using %s workaround\n", + smp_processor_id(), spectre_v2_method); + + return SPECTRE_MITIGATED; +} +#else +static unsigned int spectre_v2_install_workaround(unsigned int method) +{ + pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n"); + + return SPECTRE_VULNERABLE; +} +#endif + +static void cpu_v7_spectre_v2_init(void) +{ + unsigned int state, method = 0; switch (read_cpuid_part()) { case ARM_CPU_PART_CORTEX_A8: @@ -51,68 +125,57 @@ static void cpu_v7_spectre_init(void) case ARM_CPU_PART_CORTEX_A17: case ARM_CPU_PART_CORTEX_A73: case ARM_CPU_PART_CORTEX_A75: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_bpiall; - spectre_v2_method = "BPIALL"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; break; case ARM_CPU_PART_CORTEX_A15: case ARM_CPU_PART_BRAHMA_B15: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_iciallu; - spectre_v2_method = "ICIALLU"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_ICIALLU; break; -#ifdef CONFIG_ARM_PSCI case ARM_CPU_PART_BRAHMA_B53: /* Requires no workaround */ + state = SPECTRE_UNAFFECTED; break; + default: /* Other ARM CPUs require no workaround */ - if (read_cpuid_implementor() == ARM_CPU_IMP_ARM) + if (read_cpuid_implementor() == ARM_CPU_IMP_ARM) { + state = SPECTRE_UNAFFECTED; break; + } + fallthrough; - /* Cortex A57/A72 require firmware workaround */ - case ARM_CPU_PART_CORTEX_A57: - case ARM_CPU_PART_CORTEX_A72: { - struct arm_smccc_res res; - arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 != 0) - return; + /* Cortex A57/A72 require firmware workaround */ + case ARM_CPU_PART_CORTEX_A57: + case ARM_CPU_PART_CORTEX_A72: + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + break; switch (arm_smccc_1_1_get_conduit()) { case SMCCC_CONDUIT_HVC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_hvc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_hvc_switch_mm; - spectre_v2_method = "hypervisor"; + method = SPECTRE_V2_METHOD_HVC; break; case SMCCC_CONDUIT_SMC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_smc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_smc_switch_mm; - spectre_v2_method = "firmware"; + method = SPECTRE_V2_METHOD_SMC; break; default: + state = SPECTRE_VULNERABLE; break; } } -#endif - } - if (spectre_v2_method) - pr_info("CPU%u: Spectre v2: using %s workaround\n", - smp_processor_id(), spectre_v2_method); -} -#else -static void cpu_v7_spectre_init(void) -{ + if (state == SPECTRE_MITIGATED) + state = spectre_v2_install_workaround(method); + + spectre_v2_update_state(state, method); } -#endif static __maybe_unused bool cpu_v7_check_auxcr_set(bool *warned, u32 mask, const char *msg) @@ -142,16 +205,16 @@ static bool check_spectre_auxcr(bool *warned, u32 bit) void cpu_v7_ca8_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(6))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_ca15_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(0))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_bugs_init(void) { - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } -- cgit From 04e91b7324760a377a725e218b5ee783826d30f5 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Feb 2022 19:46:15 +0000 Subject: ARM: early traps initialisation Provide a couple of helpers to copy the vectors and stubs, and also to flush the copied vectors and stubs. Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/traps.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 195dff58bafc..abc82dfeab51 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -787,10 +787,22 @@ static inline void __init kuser_init(void *vectors) } #endif +#ifndef CONFIG_CPU_V7M +static void copy_from_lma(void *vma, void *lma_start, void *lma_end) +{ + memcpy(vma, lma_start, lma_end - lma_start); +} + +static void flush_vectors(void *vma, size_t offset, size_t size) +{ + unsigned long start = (unsigned long)vma + offset; + unsigned long end = start + size; + + flush_icache_range(start, end); +} + void __init early_trap_init(void *vectors_base) { -#ifndef CONFIG_CPU_V7M - unsigned long vectors = (unsigned long)vectors_base; extern char __stubs_start[], __stubs_end[]; extern char __vectors_start[], __vectors_end[]; unsigned i; @@ -811,17 +823,20 @@ void __init early_trap_init(void *vectors_base) * into the vector page, mapped at 0xffff0000, and ensure these * are visible to the instruction stream. */ - memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start); - memcpy((void *)vectors + 0x1000, __stubs_start, __stubs_end - __stubs_start); + copy_from_lma(vectors_base, __vectors_start, __vectors_end); + copy_from_lma(vectors_base + 0x1000, __stubs_start, __stubs_end); kuser_init(vectors_base); - flush_icache_range(vectors, vectors + PAGE_SIZE * 2); + flush_vectors(vectors_base, 0, PAGE_SIZE * 2); +} #else /* ifndef CONFIG_CPU_V7M */ +void __init early_trap_init(void *vectors_base) +{ /* * on V7-M there is no need to copy the vector table to a dedicated * memory area. The address is configurable and so a table in the kernel * image can be used. */ -#endif } +#endif -- cgit From 8d9d651ff2270a632e9dc497b142db31e8911315 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Feb 2022 19:49:50 +0000 Subject: ARM: use LOADADDR() to get load address of sections Use the linker's LOADADDR() macro to get the load address of the sections, and provide a macro to set the start and end symbols. Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/vmlinux.lds.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index 4a91428c324d..e02710d17cf9 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -26,6 +26,11 @@ #define ARM_MMU_DISCARD(x) x #endif +/* Set start/end symbol names to the LMA for the section */ +#define ARM_LMA(sym, section) \ + sym##_start = LOADADDR(section); \ + sym##_end = LOADADDR(section) + SIZEOF(section) + #define PROC_INFO \ . = ALIGN(4); \ __proc_info_begin = .; \ @@ -110,19 +115,19 @@ * only thing that matters is their relative offsets */ #define ARM_VECTORS \ - __vectors_start = .; \ + __vectors_lma = .; \ .vectors 0xffff0000 : AT(__vectors_start) { \ *(.vectors) \ } \ - . = __vectors_start + SIZEOF(.vectors); \ - __vectors_end = .; \ + ARM_LMA(__vectors, .vectors); \ + . = __vectors_lma + SIZEOF(.vectors); \ \ - __stubs_start = .; \ - .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) { \ + __stubs_lma = .; \ + .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \ *(.stubs) \ } \ - . = __stubs_start + SIZEOF(.stubs); \ - __stubs_end = .; \ + ARM_LMA(__stubs, .stubs); \ + . = __stubs_lma + SIZEOF(.stubs); \ \ PROVIDE(vector_fiq_offset = vector_fiq - ADDR(.vectors)); -- cgit From b9baf5c8c5c356757f4f9d8180b5e9d234065bc3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 10 Feb 2022 16:05:45 +0000 Subject: ARM: Spectre-BHB workaround Workaround the Spectre BHB issues for Cortex-A15, Cortex-A57, Cortex-A72, Cortex-A73 and Cortex-A75. We also include Brahma B15 as well to be safe, which is affected by Spectre V2 in the same ways as Cortex-A15. Reviewed-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/assembler.h | 10 +++++ arch/arm/include/asm/spectre.h | 4 ++ arch/arm/include/asm/vmlinux.lds.h | 18 +++++++-- arch/arm/kernel/entry-armv.S | 79 +++++++++++++++++++++++++++++++++++--- arch/arm/kernel/entry-common.S | 24 ++++++++++++ arch/arm/kernel/spectre.c | 4 ++ arch/arm/kernel/traps.c | 38 ++++++++++++++++++ arch/arm/mm/Kconfig | 10 +++++ arch/arm/mm/proc-v7-bugs.c | 76 ++++++++++++++++++++++++++++++++++++ 9 files changed, 254 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 7d23d4bb2168..49ea603fc8e3 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -107,6 +107,16 @@ .endm #endif +#if __LINUX_ARM_ARCH__ < 7 + .macro dsb, args + mcr p15, 0, r0, c7, c10, 4 + .endm + + .macro isb, args + mcr p15, 0, r0, c7, r5, 4 + .endm +#endif + .macro asm_trace_hardirqs_off, save=1 #if defined(CONFIG_TRACE_IRQFLAGS) .if \save diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h index 8a9019e08dba..d1fa5607d3aa 100644 --- a/arch/arm/include/asm/spectre.h +++ b/arch/arm/include/asm/spectre.h @@ -14,6 +14,7 @@ enum { __SPECTRE_V2_METHOD_ICIALLU, __SPECTRE_V2_METHOD_SMC, __SPECTRE_V2_METHOD_HVC, + __SPECTRE_V2_METHOD_LOOP8, }; enum { @@ -21,8 +22,11 @@ enum { SPECTRE_V2_METHOD_ICIALLU = BIT(__SPECTRE_V2_METHOD_ICIALLU), SPECTRE_V2_METHOD_SMC = BIT(__SPECTRE_V2_METHOD_SMC), SPECTRE_V2_METHOD_HVC = BIT(__SPECTRE_V2_METHOD_HVC), + SPECTRE_V2_METHOD_LOOP8 = BIT(__SPECTRE_V2_METHOD_LOOP8), }; void spectre_v2_update_state(unsigned int state, unsigned int methods); +int spectre_bhb_update_vectors(unsigned int method); + #endif diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index e02710d17cf9..0ef21bfae9f6 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -116,11 +116,23 @@ */ #define ARM_VECTORS \ __vectors_lma = .; \ - .vectors 0xffff0000 : AT(__vectors_start) { \ - *(.vectors) \ + OVERLAY 0xffff0000 : NOCROSSREFS AT(__vectors_lma) { \ + .vectors { \ + *(.vectors) \ + } \ + .vectors.bhb.loop8 { \ + *(.vectors.bhb.loop8) \ + } \ + .vectors.bhb.bpiall { \ + *(.vectors.bhb.bpiall) \ + } \ } \ ARM_LMA(__vectors, .vectors); \ - . = __vectors_lma + SIZEOF(.vectors); \ + ARM_LMA(__vectors_bhb_loop8, .vectors.bhb.loop8); \ + ARM_LMA(__vectors_bhb_bpiall, .vectors.bhb.bpiall); \ + . = __vectors_lma + SIZEOF(.vectors) + \ + SIZEOF(.vectors.bhb.loop8) + \ + SIZEOF(.vectors.bhb.bpiall); \ \ __stubs_lma = .; \ .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 5cd057859fe9..676703cbfe4b 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1002,12 +1002,11 @@ vector_\name: sub lr, lr, #\correction .endif - @ - @ Save r0, lr_ (parent PC) and spsr_ - @ (parent CPSR) - @ + @ Save r0, lr_ (parent PC) stmia sp, {r0, lr} @ save r0, lr - mrs lr, spsr + + @ Save spsr_ (parent CPSR) +2: mrs lr, spsr str lr, [sp, #8] @ save spsr @ @@ -1028,6 +1027,44 @@ vector_\name: movs pc, lr @ branch to handler in SVC mode ENDPROC(vector_\name) +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .subsection 1 + .align 5 +vector_bhb_loop8_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mov r0, #8 +1: b . + 4 + subs r0, r0, #1 + bne 1b + dsb + isb + b 2b +ENDPROC(vector_bhb_loop8_\name) + +vector_bhb_bpiall_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mcr p15, 0, r0, c7, c5, 6 @ BPIALL + @ isb not needed due to "movs pc, lr" in the vector stub + @ which gives a "context synchronisation". + b 2b +ENDPROC(vector_bhb_bpiall_\name) + .previous +#endif + .align 2 @ handler addresses follow this label 1: @@ -1036,6 +1073,10 @@ ENDPROC(vector_\name) .section .stubs, "ax", %progbits @ This must be the first word .word vector_swi +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .word vector_bhb_loop8_swi + .word vector_bhb_bpiall_swi +#endif vector_rst: ARM( swi SYS_ERROR0 ) @@ -1150,8 +1191,10 @@ vector_addrexcptn: * FIQ "NMI" handler *----------------------------------------------------------------------------- * Handle a FIQ using the SVC stack allowing FIQ act like NMI on x86 - * systems. + * systems. This must be the last vector stub, so lets place it in its own + * subsection. */ + .subsection 2 vector_stub fiq, FIQ_MODE, 4 .long __fiq_usr @ 0 (USR_26 / USR_32) @@ -1184,6 +1227,30 @@ vector_addrexcptn: W(b) vector_irq W(b) vector_fiq +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .section .vectors.bhb.loop8, "ax", %progbits +.L__vectors_bhb_loop8_start: + W(b) vector_rst + W(b) vector_bhb_loop8_und + W(ldr) pc, .L__vectors_bhb_loop8_start + 0x1004 + W(b) vector_bhb_loop8_pabt + W(b) vector_bhb_loop8_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_loop8_irq + W(b) vector_bhb_loop8_fiq + + .section .vectors.bhb.bpiall, "ax", %progbits +.L__vectors_bhb_bpiall_start: + W(b) vector_rst + W(b) vector_bhb_bpiall_und + W(ldr) pc, .L__vectors_bhb_bpiall_start + 0x1008 + W(b) vector_bhb_bpiall_pabt + W(b) vector_bhb_bpiall_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_bpiall_irq + W(b) vector_bhb_bpiall_fiq +#endif + .data .align 2 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index ac86c34682bb..dbc1913ee30b 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -153,6 +153,29 @@ ENDPROC(ret_from_fork) *----------------------------------------------------------------------------- */ + .align 5 +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +ENTRY(vector_bhb_loop8_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mov r8, #8 +1: b 2f +2: subs r8, r8, #1 + bne 1b + dsb + isb + b 3f +ENDPROC(vector_bhb_loop8_swi) + + .align 5 +ENTRY(vector_bhb_bpiall_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mcr p15, 0, r8, c7, c5, 6 @ BPIALL + isb + b 3f +ENDPROC(vector_bhb_bpiall_swi) +#endif .align 5 ENTRY(vector_swi) #ifdef CONFIG_CPU_V7M @@ -160,6 +183,7 @@ ENTRY(vector_swi) #else sub sp, sp, #PT_REGS_SIZE stmia sp, {r0 - r12} @ Calling r0 - r12 +3: ARM( add r8, sp, #S_PC ) ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr THUMB( mov r8, sp ) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index 6f6dd1cfd099..ade967f18d06 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -45,6 +45,10 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, method = "Firmware call"; break; + case SPECTRE_V2_METHOD_LOOP8: + method = "History overwrite"; + break; + default: method = "Multiple mitigations"; break; diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index abc82dfeab51..90c887aa67a4 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -801,6 +802,43 @@ static void flush_vectors(void *vma, size_t offset, size_t size) flush_icache_range(start, end); } +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +int spectre_bhb_update_vectors(unsigned int method) +{ + extern char __vectors_bhb_bpiall_start[], __vectors_bhb_bpiall_end[]; + extern char __vectors_bhb_loop8_start[], __vectors_bhb_loop8_end[]; + void *vec_start, *vec_end; + + if (system_state >= SYSTEM_FREEING_INITMEM) { + pr_err("CPU%u: Spectre BHB workaround too late - system vulnerable\n", + smp_processor_id()); + return SPECTRE_VULNERABLE; + } + + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + vec_start = __vectors_bhb_loop8_start; + vec_end = __vectors_bhb_loop8_end; + break; + + case SPECTRE_V2_METHOD_BPIALL: + vec_start = __vectors_bhb_bpiall_start; + vec_end = __vectors_bhb_bpiall_end; + break; + + default: + pr_err("CPU%u: unknown Spectre BHB state %d\n", + smp_processor_id(), method); + return SPECTRE_VULNERABLE; + } + + copy_from_lma(vectors_page, vec_start, vec_end); + flush_vectors(vectors_page, 0, vec_end - vec_start); + + return SPECTRE_MITIGATED; +} +#endif + void __init early_trap_init(void *vectors_base) { extern char __stubs_start[], __stubs_end[]; diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 850329ea9bf8..9724c16e9076 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -851,6 +851,16 @@ config HARDEN_BRANCH_PREDICTOR If unsure, say Y. +config HARDEN_BRANCH_HISTORY + bool "Harden Spectre style attacks against branch history" if EXPERT + depends on CPU_SPECTRE + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. When + taking an exception, a sequence of branches overwrites the branch + history, or branch history is invalidated. + config TLS_REG_EMUL bool select NEED_KUSER_HELPERS diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index e438e59bb63e..c226feab2457 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -177,6 +177,81 @@ static void cpu_v7_spectre_v2_init(void) spectre_v2_update_state(state, method); } +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +static int spectre_bhb_method; + +static const char *spectre_bhb_method_name(int method) +{ + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + return "loop"; + + case SPECTRE_V2_METHOD_BPIALL: + return "BPIALL"; + + default: + return "unknown"; + } +} + +static int spectre_bhb_install_workaround(int method) +{ + if (spectre_bhb_method != method) { + if (spectre_bhb_method) { + pr_err("CPU%u: Spectre BHB: method disagreement, system vulnerable\n", + smp_processor_id()); + + return SPECTRE_VULNERABLE; + } + + if (spectre_bhb_update_vectors(method) == SPECTRE_VULNERABLE) + return SPECTRE_VULNERABLE; + + spectre_bhb_method = method; + } + + pr_info("CPU%u: Spectre BHB: using %s workaround\n", + smp_processor_id(), spectre_bhb_method_name(method)); + + return SPECTRE_MITIGATED; +} +#else +static int spectre_bhb_install_workaround(int method) +{ + return SPECTRE_VULNERABLE; +} +#endif + +static void cpu_v7_spectre_bhb_init(void) +{ + unsigned int state, method = 0; + + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A15: + case ARM_CPU_PART_BRAHMA_B15: + case ARM_CPU_PART_CORTEX_A57: + case ARM_CPU_PART_CORTEX_A72: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_LOOP8; + break; + + case ARM_CPU_PART_CORTEX_A73: + case ARM_CPU_PART_CORTEX_A75: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; + break; + + default: + state = SPECTRE_UNAFFECTED; + break; + } + + if (state == SPECTRE_MITIGATED) + state = spectre_bhb_install_workaround(method); + + spectre_v2_update_state(state, method); +} + static __maybe_unused bool cpu_v7_check_auxcr_set(bool *warned, u32 mask, const char *msg) { @@ -217,4 +292,5 @@ void cpu_v7_ca15_ibe(void) void cpu_v7_bugs_init(void) { cpu_v7_spectre_v2_init(); + cpu_v7_spectre_bhb_init(); } -- cgit From c6a502c2299941c8326d029cfc8a3bc8a4607ad5 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 4 Mar 2022 21:25:36 +0300 Subject: mISDN: Fix memory leak in dsp_pipeline_build() dsp_pipeline_build() allocates dup pointer by kstrdup(cfg), but then it updates dup variable by strsep(&dup, "|"). As a result when it calls kfree(dup), the dup variable contains NULL. Found by Linux Driver Verification project (linuxtesting.org) with SVACE. Signed-off-by: Alexey Khoroshilov Fixes: 960366cf8dbb ("Add mISDN DSP") Signed-off-by: David S. Miller --- drivers/isdn/mISDN/dsp_pipeline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/isdn/mISDN/dsp_pipeline.c b/drivers/isdn/mISDN/dsp_pipeline.c index e11ca6bbc7f4..c3b2c99b5cd5 100644 --- a/drivers/isdn/mISDN/dsp_pipeline.c +++ b/drivers/isdn/mISDN/dsp_pipeline.c @@ -192,7 +192,7 @@ void dsp_pipeline_destroy(struct dsp_pipeline *pipeline) int dsp_pipeline_build(struct dsp_pipeline *pipeline, const char *cfg) { int found = 0; - char *dup, *tok, *name, *args; + char *dup, *next, *tok, *name, *args; struct dsp_element_entry *entry, *n; struct dsp_pipeline_entry *pipeline_entry; struct mISDN_dsp_element *elem; @@ -203,10 +203,10 @@ int dsp_pipeline_build(struct dsp_pipeline *pipeline, const char *cfg) if (!list_empty(&pipeline->list)) _dsp_pipeline_destroy(pipeline); - dup = kstrdup(cfg, GFP_ATOMIC); + dup = next = kstrdup(cfg, GFP_ATOMIC); if (!dup) return 0; - while ((tok = strsep(&dup, "|"))) { + while ((tok = strsep(&next, "|"))) { if (!strlen(tok)) continue; name = strsep(&tok, "("); -- cgit From ff712a627f7296a42ea5d7356704525e1e909e05 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 4 Mar 2022 20:28:48 -0800 Subject: selftests/vm: cleanup hugetlb file after mremap test The hugepage-mremap test will create a file in a hugetlb filesystem. In a default 'run_vmtests' run, the file will contain all the hugetlb pages. After the test, the file remains and there are no free hugetlb pages for subsequent tests. This causes those hugetlb tests to fail. Change hugepage-mremap to take the name of the hugetlb file as an argument. Unlink the file within the test, and just to be sure remove the file in the run_vmtests script. Link: https://lkml.kernel.org/r/20220201033459.156944-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Shuah Khan Acked-by: Yosry Ahmed Reviewed-by: Muchun Song Reviewed-by: Mina Almasry Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/hugepage-mremap.c | 26 +++++++++++++++++++------- tools/testing/selftests/vm/run_vmtests.sh | 3 ++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 2a7c33631a29..1d689084a54b 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -3,9 +3,10 @@ * hugepage-mremap: * * Example of remapping huge page memory in a user application using the - * mremap system call. Code assumes a hugetlbfs filesystem is mounted - * at './huge'. The amount of memory used by this test is decided by a command - * line argument in MBs. If missing, the default amount is 10MB. + * mremap system call. The path to a file in a hugetlbfs filesystem must + * be passed as the last argument to this test. The amount of memory used + * by this test in MBs can optionally be passed as an argument. If no memory + * amount is passed, the default amount is 10MB. * * To make sure the test triggers pmd sharing and goes through the 'unshare' * path in the mremap code use 1GB (1024) or more. @@ -25,7 +26,6 @@ #define DEFAULT_LENGTH_MB 10UL #define MB_TO_BYTES(x) (x * 1024 * 1024) -#define FILE_NAME "huge/hugepagefile" #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) #define FLAGS (MAP_SHARED | MAP_ANONYMOUS) @@ -107,17 +107,26 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { + size_t length; + + if (argc != 2 && argc != 3) { + printf("Usage: %s [length_in_MB] \n", argv[0]); + exit(1); + } + /* Read memory length as the first arg if valid, otherwise fallback to - * the default length. Any additional args are ignored. + * the default length. */ - size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL; + if (argc == 3) + length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL; length = length > 0 ? length : DEFAULT_LENGTH_MB; length = MB_TO_BYTES(length); int ret = 0; - int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); + /* last arg is the hugetlb file name */ + int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755); if (fd < 0) { perror("Open failed"); @@ -169,5 +178,8 @@ int main(int argc, char *argv[]) munmap(addr, length); + close(fd); + unlink(argv[argc-1]); + return ret; } diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 75d401741394..71d2dc198fc1 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -111,13 +111,14 @@ fi echo "-----------------------" echo "running hugepage-mremap" echo "-----------------------" -./hugepage-mremap 256 +./hugepage-mremap $mnt/huge_mremap if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 else echo "[PASS]" fi +rm -f $mnt/huge_mremap echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" -- cgit From 5c26f6ac9416b63d093e29c30e79b3297e425472 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:51 -0800 Subject: mm: refactor vm_area_struct::anon_vma_name usage code Avoid mixing strings and their anon_vma_name referenced pointers by using struct anon_vma_name whenever possible. This simplifies the code and allows easier sharing of anon_vma_name structures when they represent the same name. [surenb@google.com: fix comment] Link: https://lkml.kernel.org/r/20220223153613.835563-1-surenb@google.com Link: https://lkml.kernel.org/r/20220224231834.1481408-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Colin Cross Cc: Sumit Semwal Cc: Dave Hansen Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Johannes Weiner Cc: "Eric W. Biederman" Cc: Christian Brauner Cc: Alexey Gladkov Cc: Sasha Levin Cc: Chris Hyser Cc: Davidlohr Bueso Cc: Peter Collingbourne Cc: Xiaofeng Cao Cc: David Hildenbrand Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 6 ++-- fs/userfaultfd.c | 6 ++-- include/linux/mm.h | 7 ++-- include/linux/mm_inline.h | 87 ++++++++++++++++++++++++++++++++--------------- include/linux/mm_types.h | 5 ++- kernel/fork.c | 4 +-- kernel/sys.c | 19 +++++++---- mm/madvise.c | 87 ++++++++++++++++------------------------------- mm/mempolicy.c | 2 +- mm/mlock.c | 2 +- mm/mmap.c | 12 +++---- mm/mprotect.c | 2 +- 12 files changed, 125 insertions(+), 114 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6e97ed775074..2c48b1eaaa9c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - const char *anon_name; + struct anon_vma_name *anon_name; if (!mm) { name = "[vdso]"; @@ -327,10 +327,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = vma_anon_name(vma); + anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); - seq_printf(m, "[anon:%s]", anon_name); + seq_printf(m, "[anon:%s]", anon_name->name); } } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e26b10132d47..8e03b3d3f5fa 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -878,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) vma = prev; else @@ -1438,7 +1438,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), - vma_anon_name(vma)); + anon_vma_name(vma)); if (prev) { vma = prev; goto next; @@ -1615,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..5744a3fc4716 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2626,7 +2626,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, const char *); + struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3372,11 +3372,12 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name); + unsigned long len_in, + struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name) { + unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..dd3accaa4e6d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -140,50 +140,81 @@ static __always_inline void del_page_from_lru_list(struct page *page, #ifdef CONFIG_ANON_VMA_NAME /* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. */ -extern const char *vma_anon_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name_alloc(const char *name); +extern void anon_vma_name_free(struct kref *kref); -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); +/* mmap_lock should be read-locked */ +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_get(&anon_name->kref); +} -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_put(&anon_name->kref, anon_vma_name_free); +} -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + struct anon_vma_name *anon_name = anon_vma_name(orig_vma); + + if (anon_name) { + anon_vma_name_get(anon_name); + new_vma->anon_name = anon_name; + } +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) { - const char *vma_name = vma_anon_name(vma); + /* + * Not using anon_vma_name because it generates a warning if mmap_lock + * is not held, which might be the case here. + */ + if (!vma->vm_file) + anon_vma_name_put(vma->anon_name); +} - /* either both NULL, or pointers to same string */ - if (vma_name == name) +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + if (anon_name1 == anon_name2) return true; - return name && vma_name && !strcmp(name, vma_name); + return anon_name1 && anon_name2 && + !strcmp(anon_name1->name, anon_name2->name); } + #else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { return NULL; } -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} + +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_anon_vma_name(struct vm_area_struct *vma) {} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) { return true; } + #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5140e5feb486..0f549870da6a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -416,7 +416,10 @@ struct vm_area_struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; - /* Serialized by mmap_sem. */ + /* + * Serialized by mmap_sem. Never use directly because it is + * valid only when vm_file is NULL. Use anon_vma_name instead. + */ struct anon_vma_name *anon_name; }; diff --git a/kernel/fork.c b/kernel/fork.c index a024bf6254df..f1e89007f228 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -366,14 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; - dup_vma_anon_name(orig, new); + dup_anon_vma_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { - free_vma_anon_name(vma); + free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index 97dc9e5d6bf9..5b0e172c4d47 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -2286,15 +2287,16 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, { struct mm_struct *mm = current->mm; const char __user *uname; - char *name, *pch; + struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: uname = (const char __user *)arg; if (uname) { - name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + char *name, *pch; + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); if (IS_ERR(name)) return PTR_ERR(name); @@ -2304,15 +2306,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, return -EINVAL; } } - } else { - /* Reset the name */ - name = NULL; + /* anon_vma has its own copy */ + anon_name = anon_vma_name_alloc(name); + kfree(name); + if (!anon_name) + return -ENOMEM; + } mmap_write_lock(mm); - error = madvise_set_anon_name(mm, addr, size, name); + error = madvise_set_anon_name(mm, addr, size, anon_name); mmap_write_unlock(mm); - kfree(name); + anon_vma_name_put(anon_name); break; default: error = -EINVAL; diff --git a/mm/madvise.c b/mm/madvise.c index 5604064df464..081b1cded21e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -65,7 +65,7 @@ static int madvise_need_mmap_write(int behavior) } #ifdef CONFIG_ANON_VMA_NAME -static struct anon_vma_name *anon_vma_name_alloc(const char *name) +struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; @@ -81,78 +81,49 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name) return anon_name; } -static void vma_anon_name_free(struct kref *kref) +void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } -static inline bool has_vma_anon_name(struct vm_area_struct *vma) +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - return !vma->vm_file && vma->anon_name; -} - -const char *vma_anon_name(struct vm_area_struct *vma) -{ - if (!has_vma_anon_name(vma)) - return NULL; - mmap_assert_locked(vma->vm_mm); - return vma->anon_name->name; -} - -void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) -{ - if (!has_vma_anon_name(orig_vma)) - return; - - kref_get(&orig_vma->anon_name->kref); - new_vma->anon_name = orig_vma->anon_name; -} - -void free_vma_anon_name(struct vm_area_struct *vma) -{ - struct anon_vma_name *anon_name; - - if (!has_vma_anon_name(vma)) - return; + if (vma->vm_file) + return NULL; - anon_name = vma->anon_name; - vma->anon_name = NULL; - kref_put(&anon_name->kref, vma_anon_name_free); + return vma->anon_name; } /* mmap_lock should be write-locked */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - const char *anon_name; + struct anon_vma_name *orig_name = anon_vma_name(vma); - if (!name) { - free_vma_anon_name(vma); + if (!anon_name) { + vma->anon_name = NULL; + anon_vma_name_put(orig_name); return 0; } - anon_name = vma_anon_name(vma); - if (anon_name) { - /* Same name, nothing to do here */ - if (!strcmp(name, anon_name)) - return 0; + if (anon_vma_name_eq(orig_name, anon_name)) + return 0; - free_vma_anon_name(vma); - } - vma->anon_name = anon_vma_name_alloc(name); - if (!vma->anon_name) - return -ENOMEM; + anon_vma_name_get(anon_name); + vma->anon_name = anon_name; + anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - if (name) + if (anon_name) return -EINVAL; return 0; @@ -165,13 +136,13 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, - const char *name) + struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { + if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } @@ -179,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, name); + vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; goto success; @@ -209,7 +180,7 @@ success: */ vma->vm_flags = new_flags; if (!vma->vm_file) { - error = replace_vma_anon_name(vma, name); + error = replace_anon_vma_name(vma, anon_name); if (error) return error; } @@ -1041,7 +1012,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, } error = madvise_update_vma(vma, prev, start, end, new_flags, - vma_anon_name(vma)); + anon_vma_name(vma)); out: /* @@ -1225,7 +1196,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long name) + unsigned long anon_name) { int error; @@ -1234,7 +1205,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (const char *)name); + (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1246,7 +1217,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name) + unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; @@ -1266,7 +1237,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)name, + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 028e8dd82b44..69284d3b5e53 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -814,7 +814,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, - vma_anon_name(vma)); + anon_vma_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index 8f584eddd305..25934e7db3e1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index d445c1b9d606..f61a15474dd6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1031,7 +1031,7 @@ again: static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; - if (!is_same_vma_anon_name(vma, anon_name)) + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) return 0; return 1; } @@ -1084,7 +1084,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { @@ -1106,7 +1106,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { @@ -1167,7 +1167,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -3256,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index 5ca3fbcb1495..2887644fd150 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); -- cgit From 96403e11283def1d1c465c8279514c9a504d8630 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:55 -0800 Subject: mm: prevent vm_area_struct::anon_name refcount saturation A deep process chain with many vmas could grow really high. With default sysctl_max_map_count (64k) and default pid_max (32k) the max number of vmas in the system is 2147450880 and the refcounter has headroom of 1073774592 before it reaches REFCOUNT_SATURATED (3221225472). Therefore it's unlikely that an anonymous name refcounter will overflow with these defaults. Currently the max for pid_max is PID_MAX_LIMIT (4194304) and for sysctl_max_map_count it's INT_MAX (2147483647). In this configuration anon_vma_name refcount overflow becomes theoretically possible (that still require heavy sharing of that anon_vma_name between processes). kref refcounting interface used in anon_vma_name structure will detect a counter overflow when it reaches REFCOUNT_SATURATED value but will only generate a warning and freeze the ref counter. This would lead to the refcounted object never being freed. A determined attacker could leak memory like that but it would be rather expensive and inefficient way to do so. To ensure anon_vma_name refcount does not overflow, stop anon_vma_name sharing when the refcount reaches REFCOUNT_MAX (2147483647), which still leaves INT_MAX/2 (1073741823) values before the counter reaches REFCOUNT_SATURATED. This should provide enough headroom for raising the refcounts temporarily. Link: https://lkml.kernel.org/r/20220223153613.835563-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Alexey Gladkov Cc: Chris Hyser Cc: Christian Brauner Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Peter Collingbourne Cc: Sasha Levin Cc: Sumit Semwal Cc: Vlastimil Babka Cc: Xiaofeng Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 18 ++++++++++++++---- mm/madvise.c | 3 +-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index dd3accaa4e6d..cf90b1fa2c60 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -161,15 +161,25 @@ static inline void anon_vma_name_put(struct anon_vma_name *anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } +static inline +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) +{ + /* Prevent anon_name refcount saturation early on */ + if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { + anon_vma_name_get(anon_name); + return anon_name; + + } + return anon_vma_name_alloc(anon_name->name); +} + static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); - if (anon_name) { - anon_vma_name_get(anon_name); - new_vma->anon_name = anon_name; - } + if (anon_name) + new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) diff --git a/mm/madvise.c b/mm/madvise.c index 081b1cded21e..1f2693dccf7b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -113,8 +113,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, if (anon_vma_name_eq(orig_name, anon_name)) return 0; - anon_vma_name_get(anon_name); - vma->anon_name = anon_name; + vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; -- cgit From 942341dcc5748d9c1fc7009a359fc1916bfe0ef0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:58 -0800 Subject: mm: fix use-after-free when anon vma name is used after vma is freed When adjacent vmas are being merged it can result in the vma that was originally passed to madvise_update_vma being destroyed. In the current implementation, the name parameter passed to madvise_update_vma points directly to vma->anon_name and it is used after the call to vma_merge. In the cases when vma_merge merges the original vma and destroys it, this might result in UAF. For that the original vma would have to hold the anon_vma_name with the last reference. The following vma would need to contain a different anon_vma_name object with the same string. Such scenario is shown below: madvise_vma_behavior(vma) madvise_update_vma(vma, ..., anon_name == vma->anon_name) vma_merge(vma) __vma_adjust(vma) <-- merges vma with adjacent one vm_area_free(vma) <-- frees the original vma replace_vma_anon_name(anon_name) <-- UAF of vma->anon_name Fix this by raising the name refcount and stabilizing it. Link: https://lkml.kernel.org/r/20220224231834.1481408-3-surenb@google.com Link: https://lkml.kernel.org/r/20220223153613.835563-3-surenb@google.com Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory") Signed-off-by: Suren Baghdasaryan Reported-by: syzbot+aa7b3d4b35f9dc46a366@syzkaller.appspotmail.com Acked-by: Michal Hocko Cc: Alexey Gladkov Cc: Chris Hyser Cc: Christian Brauner Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Collingbourne Cc: Sasha Levin Cc: Sumit Semwal Cc: Vlastimil Babka Cc: Xiaofeng Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 1f2693dccf7b..38d0f515d548 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -131,6 +131,8 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; + * Caller should ensure anon_name stability by raising its refcount even when + * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, @@ -945,6 +947,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, unsigned long behavior) { int error; + struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; switch (behavior) { @@ -1010,8 +1013,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } + anon_name = anon_vma_name(vma); + anon_vma_name_get(anon_name); error = madvise_update_vma(vma, prev, start, end, new_flags, - anon_vma_name(vma)); + anon_name); + anon_vma_name_put(anon_name); out: /* -- cgit From f2b277c4d1c63a85127e8aa2588e9cc3bd21cb99 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 4 Mar 2022 20:29:01 -0800 Subject: memfd: fix F_SEAL_WRITE after shmem huge page allocated Wangyong reports: after enabling tmpfs filesystem to support transparent hugepage with the following command: echo always > /sys/kernel/mm/transparent_hugepage/shmem_enabled the docker program tries to add F_SEAL_WRITE through the following command, but it fails unexpectedly with errno EBUSY: fcntl(5, F_ADD_SEALS, F_SEAL_WRITE) = -1. That is because memfd_tag_pins() and memfd_wait_for_pins() were never updated for shmem huge pages: checking page_mapcount() against page_count() is hopeless on THP subpages - they need to check total_mapcount() against page_count() on THP heads only. Make memfd_tag_pins() (compared > 1) as strict as memfd_wait_for_pins() (compared != 1): either can be justified, but given the non-atomic total_mapcount() calculation, it is better now to be strict. Bear in mind that total_mapcount() itself scans all of the THP subpages, when choosing to take an XA_CHECK_SCHED latency break. Also fix the unlikely xa_is_value() case in memfd_wait_for_pins(): if a page has been swapped out since memfd_tag_pins(), then its refcount must have fallen, and so it can safely be untagged. Link: https://lkml.kernel.org/r/a4f79248-df75-2c8c-3df-ba3317ccb5da@google.com Signed-off-by: Hugh Dickins Reported-by: Zeal Robot Reported-by: wangyong Cc: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: CGEL ZTE Cc: Kirill A. Shutemov Cc: Song Liu Cc: Yang Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memfd.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 9f80f162791a..08f5f8304746 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -31,20 +31,28 @@ static void memfd_tag_pins(struct xa_state *xas) { struct page *page; - unsigned int tagged = 0; + int latency = 0; + int cache_count; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, page, ULONG_MAX) { - if (xa_is_value(page)) - continue; - page = find_subpage(page, xas->xa_index); - if (page_count(page) - page_mapcount(page) > 1) + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (!xa_is_value(page) && + page_count(page) - total_mapcount(page) != cache_count) xas_set_mark(xas, MEMFD_TAG_PINNED); + if (cache_count != 1) + xas_set(xas, page->index + cache_count); - if (++tagged % XA_CHECK_SCHED) + latency += cache_count; + if (latency < XA_CHECK_SCHED) continue; + latency = 0; xas_pause(xas); xas_unlock_irq(xas); @@ -73,7 +81,8 @@ static int memfd_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - unsigned int tagged = 0; + int latency = 0; + int cache_count; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; @@ -87,10 +96,14 @@ static int memfd_wait_for_pins(struct address_space *mapping) xas_lock_irq(&xas); xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; - if (xa_is_value(page)) - continue; - page = find_subpage(page, xas.xa_index); - if (page_count(page) - page_mapcount(page) != 1) { + + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (!xa_is_value(page) && cache_count != + page_count(page) - total_mapcount(page)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still @@ -103,8 +116,11 @@ static int memfd_wait_for_pins(struct address_space *mapping) } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); - if (++tagged % XA_CHECK_SCHED) + + latency += cache_count; + if (latency < XA_CHECK_SCHED) continue; + latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); -- cgit From b773827e361952b3f53ac6fa4c4e39ccd632102e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Mar 2022 20:29:04 -0800 Subject: kselftest/vm: fix tests build with old libc The error message when I build vm tests on debian10 (GLIBC 2.28): userfaultfd.c: In function `userfaultfd_pagemap_test': userfaultfd.c:1393:37: error: `MADV_PAGEOUT' undeclared (first use in this function); did you mean `MADV_RANDOM'? if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) ^~~~~~~~~~~~ MADV_RANDOM This patch includes these newer definitions from UAPI linux/mman.h, is useful to fix tests build on systems without these definitions in glibc sys/mman.h. Link: https://lkml.kernel.org/r/20220227055330.43087-2-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 2f49c9af1b58..3fc1d2ee2948 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include -- cgit From dd21bfa425c098b95ca86845f8e7d1ec1ddf6e4a Mon Sep 17 00:00:00 2001 From: Yun Zhou Date: Fri, 4 Mar 2022 20:29:07 -0800 Subject: proc: fix documentation and description of pagemap Since bit 57 was exported for uffd-wp write-protected (commit fb8e37f35a2f: "mm/pagemap: export uffd-wp protection information"), fixing it can reduce some unnecessary confusion. Link: https://lkml.kernel.org/r/20220301044538.3042713-1-yun.zhou@windriver.com Fixes: fb8e37f35a2fe1 ("mm/pagemap: export uffd-wp protection information") Signed-off-by: Yun Zhou Reviewed-by: Peter Xu Cc: Jonathan Corbet Cc: Tiberiu A Georgescu Cc: Florian Schmidt Cc: Ivan Teterevkov Cc: SeongJae Park Cc: Yang Shi Cc: David Hildenbrand Cc: Axel Rasmussen Cc: Miaohe Lin Cc: Andrea Arcangeli Cc: Colin Cross Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 2 +- fs/proc/task_mmu.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index bfc28704856c..6e2e416af783 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -23,7 +23,7 @@ There are four components to pagemap: * Bit 56 page exclusively mapped (since 4.2) * Bit 57 pte is uffd-wp write-protected (since 5.13) (see :ref:`Documentation/admin-guide/mm/userfaultfd.rst `) - * Bits 57-60 zero + * Bits 58-60 zero * Bit 61 page is file-page or shared-anon (since 3.5) * Bit 62 page swapped * Bit 63 page present diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2c48b1eaaa9c..f46060eb91b5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1597,7 +1597,8 @@ static const struct mm_walk_ops pagemap_ops = { * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped - * Bits 57-60 zero + * Bit 57 pte is uffd-wp write-protected + * Bits 58-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present -- cgit From d1eff16d727ff257b706d32114d3881f67cc9c75 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 4 Mar 2022 20:29:10 -0800 Subject: configs/debug: set CONFIG_DEBUG_INFO=y properly CONFIG_DEBUG_INFO can't be set by user directly, so set CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y instead. Otherwise, we end up with no debuginfo in vmlinux which is a big no-no for kernel debugging. Link: https://lkml.kernel.org/r/20220301202920.18488-1-quic_qiancai@quicinc.com Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/debug.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e9ffb0cc1eec..07df6d93c4df 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -16,7 +16,7 @@ CONFIG_SYMBOLIC_ERRNAME=y # # Compile-time checks and compiler options # -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_FRAME_WARN=2048 CONFIG_SECTION_MISMATCH_WARN_ONLY=y -- cgit From a3d9001b4e287fc043e5539d03d71a32ab114bcb Mon Sep 17 00:00:00 2001 From: Kai Lueke Date: Thu, 3 Mar 2022 15:55:10 +0100 Subject: Revert "xfrm: state and policy should fail if XFRMA_IF_ID 0" This reverts commit 68ac0f3810e76a853b5f7b90601a05c3048b8b54 because ID 0 was meant to be used for configuring the policy/state without matching for a specific interface (e.g., Cilium is affected, see https://github.com/cilium/cilium/pull/18789 and https://github.com/cilium/cilium/pull/19019). Signed-off-by: Kai Lueke Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a4fb596e87af..72b2f173aac8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -630,13 +630,8 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!x->if_id) { - err = -EINVAL; - goto error; - } - } err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -1432,13 +1427,8 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, mark = xfrm_mark_get(attrs, &m); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!if_id) { - err = -EINVAL; - goto out_noput; - } - } if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); @@ -1751,13 +1741,8 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!xp->if_id) { - err = -EINVAL; - goto error; - } - } return xp; error: -- cgit From afb3cc1a397d77771f342691b7e6b032a234d7f2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Mar 2022 16:08:40 +0200 Subject: net: dsa: unlock the rtnl_mutex when dsa_master_setup() fails After the blamed commit, dsa_tree_setup_master() may exit without calling rtnl_unlock(), fix that. Fixes: c146f9bc195a ("net: dsa: hold rtnl_mutex when calling dsa_master_{setup,teardown}") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b4e67758e104..074e4a69a728 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1058,7 +1058,7 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { struct dsa_port *dp; - int err; + int err = 0; rtnl_lock(); @@ -1066,13 +1066,13 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst) if (dsa_port_is_cpu(dp)) { err = dsa_master_setup(dp->master, dp); if (err) - return err; + break; } } rtnl_unlock(); - return 0; + return err; } static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) -- cgit From e2ae38cf3d91837a493cb2093c87700ff3cbe667 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sat, 5 Mar 2022 15:25:25 +0530 Subject: vhost: fix hung thread due to erroneous iotlb entries In vhost_iotlb_add_range_ctx(), range size can overflow to 0 when start is 0 and last is ULONG_MAX. One instance where it can happen is when userspace sends an IOTLB message with iova=size=uaddr=0 (vhost_process_iotlb_msg). So, an entry with size = 0, start = 0, last = ULONG_MAX ends up in the iotlb. Next time a packet is sent, iotlb_access_ok() loops indefinitely due to that erroneous entry. Call Trace: iotlb_access_ok+0x21b/0x3e0 drivers/vhost/vhost.c:1340 vq_meta_prefetch+0xbc/0x280 drivers/vhost/vhost.c:1366 vhost_transport_do_send_pkt+0xe0/0xfd0 drivers/vhost/vsock.c:104 vhost_worker+0x23d/0x3d0 drivers/vhost/vhost.c:372 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Reported by syzbot at: https://syzkaller.appspot.com/bug?extid=0abd373e2e50d704db87 To fix this, do two things: 1. Return -EINVAL in vhost_chr_write_iter() when userspace asks to map a range with size 0. 2. Fix vhost_iotlb_add_range_ctx() to handle the range [0, ULONG_MAX] by splitting it into two entries. Fixes: 0bbe30668d89e ("vhost: factor out IOTLB") Reported-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com Tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com Signed-off-by: Anirudh Rayabharam Link: https://lore.kernel.org/r/20220305095525.5145-1-mail@anirudhrb.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/iotlb.c | 11 +++++++++++ drivers/vhost/vhost.c | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c index 670d56c879e5..40b098320b2a 100644 --- a/drivers/vhost/iotlb.c +++ b/drivers/vhost/iotlb.c @@ -57,6 +57,17 @@ int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, if (last < start) return -EFAULT; + /* If the range being mapped is [0, ULONG_MAX], split it into two entries + * otherwise its size would overflow u64. + */ + if (start == 0 && last == ULONG_MAX) { + u64 mid = last / 2; + + vhost_iotlb_add_range_ctx(iotlb, start, mid, addr, perm, opaque); + addr += mid + 1; + start = mid + 1; + } + if (iotlb->limit && iotlb->nmaps == iotlb->limit && iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) { diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 59edb5a1ffe2..55475fd59fb7 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1170,6 +1170,11 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, goto done; } + if (msg.size == 0) { + ret = -EINVAL; + goto done; + } + if (dev->msg_handler) ret = dev->msg_handler(dev, &msg); else -- cgit From dacc73ed0b88f1a787ec20385f42ca9dd9eddcd0 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 4 Mar 2022 18:00:57 +0800 Subject: virtio-blk: Don't use MAX_DISCARD_SEGMENTS if max_discard_seg is zero Currently the value of max_discard_segment will be set to MAX_DISCARD_SEGMENTS (256) with no basis in hardware if device set 0 to max_discard_seg in configuration space. It's incorrect since the device might not be able to handle such large descriptors. To fix it, let's follow max_segments restrictions in this case. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20220304100058.116-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c443cd64fc9b..7fc2c8b97077 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -925,9 +925,15 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, &v); + + /* + * max_discard_seg == 0 is out of spec but we always + * handled it. + */ + if (!v) + v = sg_elems - 2; blk_queue_max_discard_segments(q, - min_not_zero(v, - MAX_DISCARD_SEGMENTS)); + min(v, MAX_DISCARD_SEGMENTS)); blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } -- cgit From e030759a1ddcbf61d42b6e996bfeb675e0032d8b Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 4 Mar 2022 18:00:58 +0800 Subject: virtio-blk: Remove BUG_ON() in virtio_queue_rq() Currently we have a BUG_ON() to make sure the number of sg list does not exceed queue_max_segments() in virtio_queue_rq(). However, the block layer uses queue_max_discard_segments() instead of queue_max_segments() to limit the sg list for discard requests. So the BUG_ON() might be triggered if virtio-blk device reports a larger value for max discard segment than queue_max_segments(). To fix it, let's simply remove the BUG_ON() which has become unnecessary after commit 02746e26c39e("virtio-blk: avoid preallocating big SGL for data"). And the unused vblk->sg_elems can also be removed together. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Suggested-by: Christoph Hellwig Signed-off-by: Xie Yongji Reviewed-by: Max Gurtovoy Link: https://lore.kernel.org/r/20220304100058.116-2-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7fc2c8b97077..8c415be86732 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -76,9 +76,6 @@ struct virtio_blk { */ refcount_t refs; - /* What host tells us, plus 2 for header & tailer. */ - unsigned int sg_elems; - /* Ida index - used to track minor number allocations. */ int index; @@ -322,8 +319,6 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, blk_status_t status; int err; - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); - status = virtblk_setup_cmd(vblk->vdev, req, vbr); if (unlikely(status)) return status; @@ -783,8 +778,6 @@ static int virtblk_probe(struct virtio_device *vdev) /* Prevent integer overflows and honor max vq size */ sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); - /* We need extra sg elements at head and tail. */ - sg_elems += 2; vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { err = -ENOMEM; @@ -796,7 +789,6 @@ static int virtblk_probe(struct virtio_device *vdev) mutex_init(&vblk->vdev_mutex); vblk->vdev = vdev; - vblk->sg_elems = sg_elems; INIT_WORK(&vblk->config_work, virtblk_config_changed_work); @@ -853,7 +845,7 @@ static int virtblk_probe(struct virtio_device *vdev) set_disk_ro(vblk->disk, 1); /* We can handle whatever the host told us to handle. */ - blk_queue_max_segments(q, vblk->sg_elems-2); + blk_queue_max_segments(q, sg_elems); /* No real sector limit. */ blk_queue_max_hw_sectors(q, -1U); @@ -931,7 +923,7 @@ static int virtblk_probe(struct virtio_device *vdev) * handled it. */ if (!v) - v = sg_elems - 2; + v = sg_elems; blk_queue_max_discard_segments(q, min(v, MAX_DISCARD_SEGMENTS)); -- cgit From eb057b44dbe35ae14527830236a92f51de8f9184 Mon Sep 17 00:00:00 2001 From: Zhang Min Date: Tue, 1 Mar 2022 17:10:59 +0800 Subject: vdpa: fix use-after-free on vp_vdpa_remove When vp_vdpa driver is unbind, vp_vdpa is freed in vdpa_unregister_device and then vp_vdpa->mdev.pci_dev is dereferenced in vp_modern_remove, triggering use-after-free. Call Trace of unbinding driver free vp_vdpa : do_syscall_64 vfs_write kernfs_fop_write_iter device_release_driver_internal pci_device_remove vp_vdpa_remove vdpa_unregister_device kobject_release device_release kfree Call Trace of dereference vp_vdpa->mdev.pci_dev: vp_modern_remove pci_release_selected_regions pci_release_region pci_resource_len pci_resource_end (dev)->resource[(bar)].end Signed-off-by: Zhang Min Signed-off-by: Yi Wang Link: https://lore.kernel.org/r/20220301091059.46869-1-wang.yi59@zte.com.cn Signed-off-by: Michael S. Tsirkin Fixes: 64b9f64f80a6 ("vdpa: introduce virtio pci driver") Reviewed-by: Stefano Garzarella --- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index a57e381e830b..cce101e6a940 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -533,8 +533,8 @@ static void vp_vdpa_remove(struct pci_dev *pdev) { struct vp_vdpa *vp_vdpa = pci_get_drvdata(pdev); - vdpa_unregister_device(&vp_vdpa->vdpa); vp_modern_remove(&vp_vdpa->mdev); + vdpa_unregister_device(&vp_vdpa->vdpa); } static struct pci_driver vp_vdpa_driver = { -- cgit From e7c552ec897894ec421867059e48474eb7f1ff6d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 25 Feb 2022 06:46:34 -0500 Subject: virtio: drop default for virtio-mem There's no special reason why virtio-mem needs a default that's different from what kconfig provides, any more than e.g. virtio blk. Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand --- drivers/virtio/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 34f80b7a8a64..492fc26f0b65 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -105,7 +105,6 @@ config VIRTIO_BALLOON config VIRTIO_MEM tristate "Virtio mem driver" - default m depends on X86_64 depends on VIRTIO depends on MEMORY_HOTPLUG -- cgit From 4c8093637bc9f8cc2e41eed343c12f85d6ff9e25 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 13 Jan 2022 15:11:34 +0100 Subject: vhost: remove avail_event arg from vhost_update_avail_event() In vhost_update_avail_event() we never used the `avail_event` argument, since its introduction in commit 2723feaa8ec6 ("vhost: set log when updating used flags or avail event"). Let's remove it to clean up the code. Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220113141134.186773-1-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 55475fd59fb7..082380c03a3e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1986,7 +1986,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) return 0; } -static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event) +static int vhost_update_avail_event(struct vhost_virtqueue *vq) { if (vhost_put_avail_event(vq)) return -EFAULT; @@ -2532,7 +2532,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) return false; } } else { - r = vhost_update_avail_event(vq, vq->avail_idx); + r = vhost_update_avail_event(vq); if (r) { vq_err(vq, "Failed to update avail event index at %p: %d\n", vhost_avail_event(vq), r); -- cgit From 32f1b53fe8f03d962423ba81f8e92af5839814da Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 18 Jan 2022 16:06:31 +0100 Subject: tools/virtio: fix virtio_test execution virtio_test hangs on __vring_new_virtqueue() because `vqs_list_lock` is not initialized. Let's initialize it in vdev_info_init(). Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220118150631.167015-1-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- tools/virtio/virtio_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index cb3f29c09aff..23f142af544a 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -130,6 +130,7 @@ static void vdev_info_init(struct vdev_info* dev, unsigned long long features) memset(dev, 0, sizeof *dev); dev->vdev.features = features; INIT_LIST_HEAD(&dev->vdev.vqs); + spin_lock_init(&dev->vdev.vqs_list_lock); dev->buf_size = 1024; dev->buf = malloc(dev->buf_size); assert(dev->buf); -- cgit From 3dd7d135e75cb37c8501ba02977332a2a487dd39 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 4 Mar 2022 12:10:38 -0500 Subject: tools/virtio: handle fallout from folio work just add a stub Signed-off-by: Michael S. Tsirkin --- tools/virtio/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/virtio/linux/mm_types.h diff --git a/tools/virtio/linux/mm_types.h b/tools/virtio/linux/mm_types.h new file mode 100644 index 000000000000..356ba4d496f6 --- /dev/null +++ b/tools/virtio/linux/mm_types.h @@ -0,0 +1,3 @@ +struct folio { + struct page page; +}; -- cgit From ffb217a13a2eaf6d5bd974fc83036a53ca69f1e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Mar 2022 14:28:31 -0800 Subject: Linux 5.17-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index daeb5c88b50b..87f672473523 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Superb Owl # *DOCUMENTATION* -- cgit From 48015b632f770c401f3816f144499a39f2884677 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 11 Feb 2022 17:32:37 +1100 Subject: powerpc: Fix STACKTRACE=n build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our skiroot_defconfig doesn't enable FTRACE, and so doesn't get STACKTRACE enabled either. That leads to a build failure since commit 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") made stacktrace.c build even when STACKTRACE=n. arch/powerpc/kernel/stacktrace.c: In function ‘handle_backtrace_ipi’: arch/powerpc/kernel/stacktrace.c:171:2: error: implicit declaration of function ‘nmi_cpu_backtrace’ 171 | nmi_cpu_backtrace(regs); | ^~~~~~~~~~~~~~~~~ arch/powerpc/kernel/stacktrace.c: In function ‘arch_trigger_cpumask_backtrace’: arch/powerpc/kernel/stacktrace.c:226:2: error: implicit declaration of function ‘nmi_trigger_cpumask_backtrace’ 226 | nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This happens because our headers haven't defined arch_trigger_cpumask_backtrace, which causes lib/nmi_backtrace.c not to build nmi_cpu_backtrace(). The code in question doesn't actually depend on STACKTRACE=y, that was just added because arch_trigger_cpumask_backtrace() lived in stacktrace.c for convenience. So drop the dependency on CONFIG_STACKTRACE, that causes lib/nmi_backtrace.c to build nmi_cpu_backtrace() etc. and fixes the build. Fixes: 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") [mpe: Cherry pick of 5a72345e6a78 from next into fixes] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220212111349.2806972-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index 160abcb8e9fa..ea0e487f87b1 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -9,7 +9,7 @@ long soft_nmi_interrupt(struct pt_regs *regs); static inline void arch_touch_nmi_watchdog(void) {} #endif -#if defined(CONFIG_NMI_IPI) && defined(CONFIG_STACKTRACE) +#ifdef CONFIG_NMI_IPI extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace -- cgit From 3777ea7bac3113005b7180e6b9dadf16d19a5827 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen/xenbus: don't let xenbus_grant_ring() remove grants in error case Letting xenbus_grant_ring() tear down grants in the error case is problematic, as the other side could already have used these grants. Calling gnttab_end_foreign_access_ref() without checking success is resulting in an unclear situation for any caller of xenbus_grant_ring() as in the error case the memory pages of the ring page might be partially mapped. Freeing them would risk unwanted foreign access to them, while not freeing them would leak memory. In order to remove the need to undo any gnttab_grant_foreign_access() calls, use gnttab_alloc_grant_references() to make sure no further error can occur in the loop granting access to the ring pages. It should be noted that this way of handling removes leaking of grant entries in the error case, too. This is CVE-2022-23040 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- drivers/xen/xenbus/xenbus_client.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index e8bed1cb76ba..df6890681231 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -379,7 +379,14 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, unsigned int nr_pages, grant_ref_t *grefs) { int err; - int i, j; + unsigned int i; + grant_ref_t gref_head; + + err = gnttab_alloc_grant_references(nr_pages, &gref_head); + if (err) { + xenbus_dev_fatal(dev, err, "granting access to ring page"); + return err; + } for (i = 0; i < nr_pages; i++) { unsigned long gfn; @@ -389,23 +396,14 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, else gfn = virt_to_gfn(vaddr); - err = gnttab_grant_foreign_access(dev->otherend_id, gfn, 0); - if (err < 0) { - xenbus_dev_fatal(dev, err, - "granting access to ring page"); - goto fail; - } - grefs[i] = err; + grefs[i] = gnttab_claim_grant_reference(&gref_head); + gnttab_grant_foreign_access_ref(grefs[i], dev->otherend_id, + gfn, 0); vaddr = vaddr + XEN_PAGE_SIZE; } return 0; - -fail: - for (j = 0; j < i; j++) - gnttab_end_foreign_access_ref(grefs[j], 0); - return err; } EXPORT_SYMBOL_GPL(xenbus_grant_ring); -- cgit From 6b1775f26a2da2b05a6dc8ec2b5d14e9a4701a1a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen/grant-table: add gnttab_try_end_foreign_access() Add a new grant table function gnttab_try_end_foreign_access(), which will remove and free a grant if it is not in use. Its main use case is to either free a grant if it is no longer in use, or to take some other action if it is still in use. This other action can be an error exit, or (e.g. in the case of blkfront persistent grant feature) some special handling. This is CVE-2022-23036, CVE-2022-23038 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - new patch V4: - add comments to header (Jan Beulich) --- drivers/xen/grant-table.c | 14 ++++++++++++-- include/xen/grant_table.h | 12 ++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3729bea0c989..1b82e7a3722a 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -435,11 +435,21 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly, what, ref, page ? page_to_pfn(page) : -1); } +int gnttab_try_end_foreign_access(grant_ref_t ref) +{ + int ret = _gnttab_end_foreign_access_ref(ref, 0); + + if (ret) + put_free_entry(ref); + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access); + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { - if (gnttab_end_foreign_access_ref(ref, readonly)) { - put_free_entry(ref); + if (gnttab_try_end_foreign_access(ref)) { if (page != 0) put_page(virt_to_page(page)); } else diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index cb854df031ce..358d2817741b 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -104,10 +104,22 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * access has been ended, free the given page too. Access will be ended * immediately iff the grant entry is not in use, otherwise it will happen * some time later. page may be 0, in which case no freeing will occur. + * Note that the granted page might still be accessed (read or write) by the + * other side after gnttab_end_foreign_access() returns, so even if page was + * specified as 0 it is not allowed to just reuse the page for other + * purposes immediately. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); +/* + * End access through the given grant reference, iff the grant entry is + * no longer in use. In case of success ending foreign access, the + * grant reference is deallocated. + * Return 1 if the grant entry was freed, 0 if it is still in use. + */ +int gnttab_try_end_foreign_access(grant_ref_t ref); + int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); -- cgit From abf1fd5919d6238ee3bc5eb4a9b6c3947caa6638 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen/blkfront: don't use gnttab_query_foreign_access() for mapped status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_end_foreign_access_ref() and check the success of that operation instead. For the ring allocation use alloc_pages_exact() in order to avoid high order pages in case of a multi-page ring. If a grant wasn't unmapped by the backend without persistent grants being used, set the device state to "error". This is CVE-2022-23036 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Roger Pau Monné --- V2: - use gnttab_try_end_foreign_access() V4: - use alloc_pages_exact() and free_pages_exact() - set state to error if backend didn't unmap (Roger Pau Monné) --- drivers/block/xen-blkfront.c | 63 ++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ca71a0585333..03b5fb341e58 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1288,7 +1288,8 @@ free_shadow: rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); + free_pages_exact(rinfo->ring.sring, + info->nr_ring_pages * XEN_PAGE_SIZE); rinfo->ring.sring = NULL; if (rinfo->irq) @@ -1372,9 +1373,15 @@ static int blkif_get_final_status(enum blk_req_status s1, return BLKIF_RSP_OKAY; } -static bool blkif_completion(unsigned long *id, - struct blkfront_ring_info *rinfo, - struct blkif_response *bret) +/* + * Return values: + * 1 response processed. + * 0 missing further responses. + * -1 error while processing. + */ +static int blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, + struct blkif_response *bret) { int i = 0; struct scatterlist *sg; @@ -1397,7 +1404,7 @@ static bool blkif_completion(unsigned long *id, /* Wait the second response if not yet here. */ if (s2->status < REQ_DONE) - return false; + return 0; bret->status = blkif_get_final_status(s->status, s2->status); @@ -1448,42 +1455,43 @@ static bool blkif_completion(unsigned long *id, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < num_grant; i++) { - if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the * backend has chosen to make this grant persistent) * we add it at the head of the list, so it will be * reused first. */ - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->grants_used[i]->gref); + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + return -1; + } list_add(&s->grants_used[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { /* - * If the grant is not mapped by the backend we end the - * foreign access and add it to the tail of the list, - * so it will not be picked again unless we run out of - * persistent grants. + * If the grant is not mapped by the backend we add it + * to the tail of the list, so it will not be picked + * again unless we run out of persistent grants. */ - gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { - if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->indirect_grants[i]->gref); + if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + return -1; + } list_add(&s->indirect_grants[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { struct page *indirect_page; - gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); /* * Add the used indirect page back to the list of * available pages for indirect grefs. @@ -1498,7 +1506,7 @@ static bool blkif_completion(unsigned long *id, } } - return true; + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1564,12 +1572,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } if (bret.operation != BLKIF_OP_DISCARD) { + int ret; + /* * We may need to wait for an extra response if the * I/O request is split in 2 */ - if (!blkif_completion(&id, rinfo, &bret)) + ret = blkif_completion(&id, rinfo, &bret); + if (!ret) continue; + if (unlikely(ret < 0)) + goto err; } if (add_id_to_freelist(rinfo, id)) { @@ -1676,8 +1689,7 @@ static int setup_blkring(struct xenbus_device *dev, for (i = 0; i < info->nr_ring_pages; i++) rinfo->ring_ref[i] = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, - get_order(ring_size)); + sring = alloc_pages_exact(ring_size, GFP_NOIO); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; @@ -1687,7 +1699,7 @@ static int setup_blkring(struct xenbus_device *dev, err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { - free_pages((unsigned long)sring, get_order(ring_size)); + free_pages_exact(sring, ring_size); rinfo->ring.sring = NULL; goto fail; } @@ -2532,11 +2544,10 @@ static void purge_persistent_grants(struct blkfront_info *info) list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants, node) { if (gnt_list_entry->gref == GRANT_INVALID_REF || - gnttab_query_foreign_access(gnt_list_entry->gref)) + !gnttab_try_end_foreign_access(gnt_list_entry->gref)) continue; list_del(&gnt_list_entry->node); - gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; gnt_list_entry->gref = GRANT_INVALID_REF; list_add_tail(&gnt_list_entry->node, &rinfo->grants); -- cgit From 31185df7e2b1d2fa1de4900247a12d7b9c7087eb Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen/netfront: don't use gnttab_query_foreign_access() for mapped status It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_end_foreign_access_ref() and check the success of that operation instead. This is CVE-2022-23037 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - use gnttab_try_end_foreign_access() V3: - don't use gnttab_try_end_foreign_access() --- drivers/net/xen-netfront.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 7748f07e2cf1..13926d03418e 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -424,14 +424,12 @@ static bool xennet_tx_buf_gc(struct netfront_queue *queue) queue->tx_link[id] = TX_LINK_NONE; skb = queue->tx_skbs[id]; queue->tx_skbs[id] = NULL; - if (unlikely(gnttab_query_foreign_access( - queue->grant_tx_ref[id]) != 0)) { + if (unlikely(!gnttab_end_foreign_access_ref( + queue->grant_tx_ref[id], GNTMAP_readonly))) { dev_alert(dev, "Grant still in use by backend domain\n"); goto err; } - gnttab_end_foreign_access_ref( - queue->grant_tx_ref[id], GNTMAP_readonly); gnttab_release_grant_reference( &queue->gref_tx_head, queue->grant_tx_ref[id]); queue->grant_tx_ref[id] = GRANT_INVALID_REF; -- cgit From 33172ab50a53578a95691310f49567c9266968b0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen/scsifront: don't use gnttab_query_foreign_access() for mapped status It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_try_end_foreign_access() and check the success of that operation instead. This is CVE-2022-23038 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - use gnttab_try_end_foreign_access() --- drivers/scsi/xen-scsifront.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 12c10a5e3d93..7f421600cb66 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -233,12 +233,11 @@ static void scsifront_gnttab_done(struct vscsifrnt_info *info, return; for (i = 0; i < shadow->nr_grants; i++) { - if (unlikely(gnttab_query_foreign_access(shadow->gref[i]))) { + if (unlikely(!gnttab_try_end_foreign_access(shadow->gref[i]))) { shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME "grant still in use by backend\n"); BUG(); } - gnttab_end_foreign_access(shadow->gref[i], 0, 0UL); } kfree(shadow->sg); -- cgit From d3b6372c5881cb54925212abb62c521df8ba4809 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen/gntalloc: don't use gnttab_query_foreign_access() Using gnttab_query_foreign_access() is unsafe, as it is racy by design. The use case in the gntalloc driver is not needed at all. While at it replace the call of gnttab_end_foreign_access_ref() with a call of gnttab_end_foreign_access(), which is what is really wanted there. In case the grant wasn't used due to an allocation failure, just free the grant via gnttab_free_grant_reference(). This is CVE-2022-23039 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V3: - fix __del_gref() (Jan Beulich) --- drivers/xen/gntalloc.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 3fa40c723e8e..edb0acd0b832 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -169,20 +169,14 @@ undo: __del_gref(gref); } - /* It's possible for the target domain to map the just-allocated grant - * references by blindly guessing their IDs; if this is done, then - * __del_gref will leave them in the queue_gref list. They need to be - * added to the global list so that we can free them when they are no - * longer referenced. - */ - if (unlikely(!list_empty(&queue_gref))) - list_splice_tail(&queue_gref, &gref_list); mutex_unlock(&gref_mutex); return rc; } static void __del_gref(struct gntalloc_gref *gref) { + unsigned long addr; + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { uint8_t *tmp = kmap(gref->page); tmp[gref->notify.pgoff] = 0; @@ -196,21 +190,16 @@ static void __del_gref(struct gntalloc_gref *gref) gref->notify.flags = 0; if (gref->gref_id) { - if (gnttab_query_foreign_access(gref->gref_id)) - return; - - if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) - return; - - gnttab_free_grant_reference(gref->gref_id); + if (gref->page) { + addr = (unsigned long)page_to_virt(gref->page); + gnttab_end_foreign_access(gref->gref_id, 0, addr); + } else + gnttab_free_grant_reference(gref->gref_id); } gref_size--; list_del(&gref->next_gref); - if (gref->page) - __free_page(gref->page); - kfree(gref); } -- cgit From 1dbd11ca75fe664d3e54607547771d021f531f59 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen: remove gnttab_query_foreign_access() Remove gnttab_query_foreign_access(), as it is unused and unsafe to use. All previous use cases assumed a grant would not be in use after gnttab_query_foreign_access() returned 0. This information is useless in best case, as it only refers to a situation in the past, which could have changed already. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- drivers/xen/grant-table.c | 25 ------------------------- include/xen/grant_table.h | 2 -- 2 files changed, 27 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1b82e7a3722a..e6548910e79f 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -133,13 +133,6 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); - /* - * Query the status of a grant entry. Ref parameter is reference of - * queried grant entry, return value is the status of queried entry. - * Detailed status(writing/reading) can be gotten from the return value - * by bit operations. - */ - int (*query_foreign_access)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -284,22 +277,6 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static int gnttab_query_foreign_access_v1(grant_ref_t ref) -{ - return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); -} - -static int gnttab_query_foreign_access_v2(grant_ref_t ref) -{ - return grstatus[ref] & (GTF_reading|GTF_writing); -} - -int gnttab_query_foreign_access(grant_ref_t ref) -{ - return gnttab_interface->query_foreign_access(ref); -} -EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); - static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) { u16 flags, nflags; @@ -1427,7 +1404,6 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, - .query_foreign_access = gnttab_query_foreign_access_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1439,7 +1415,6 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, - .query_foreign_access = gnttab_query_foreign_access_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 358d2817741b..ab9e692a0ef4 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -125,8 +125,6 @@ int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); -int gnttab_query_foreign_access(grant_ref_t ref); - /* * operations on reserved batches of grant references */ -- cgit From cd7bcfab4e73dcb3de92c2014c19f17af3864bfe Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: xen/usb: don't use gnttab_end_foreign_access() in xenhcd_gnttab_done() The usage of gnttab_end_foreign_access() in xenhcd_gnttab_done() is not safe against a malicious backend, as the backend could keep the I/O page mapped and modify it even after the granted memory page is being used for completely other purposes in the local system. So replace that use case with gnttab_try_end_foreign_access() and disable the PV host adapter in case the backend didn't stop using the granted page. In xenhcd_urb_request_done() immediately return in case of setting the device state to "error" instead of looking into further backend responses. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - use gnttab_try_end_foreign_access() --- drivers/usb/host/xen-hcd.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/usb/host/xen-hcd.c b/drivers/usb/host/xen-hcd.c index be09fd9bac58..19b8c7ed74cb 100644 --- a/drivers/usb/host/xen-hcd.c +++ b/drivers/usb/host/xen-hcd.c @@ -716,8 +716,9 @@ static int xenhcd_map_urb_for_request(struct xenhcd_info *info, struct urb *urb, return 0; } -static void xenhcd_gnttab_done(struct usb_shadow *shadow) +static void xenhcd_gnttab_done(struct xenhcd_info *info, unsigned int id) { + struct usb_shadow *shadow = info->shadow + id; int nr_segs = 0; int i; @@ -726,8 +727,10 @@ static void xenhcd_gnttab_done(struct usb_shadow *shadow) if (xenusb_pipeisoc(shadow->req.pipe)) nr_segs += shadow->req.u.isoc.nr_frame_desc_segs; - for (i = 0; i < nr_segs; i++) - gnttab_end_foreign_access(shadow->req.seg[i].gref, 0, 0UL); + for (i = 0; i < nr_segs; i++) { + if (!gnttab_try_end_foreign_access(shadow->req.seg[i].gref)) + xenhcd_set_error(info, "backend didn't release grant"); + } shadow->req.nr_buffer_segs = 0; shadow->req.u.isoc.nr_frame_desc_segs = 0; @@ -841,7 +844,9 @@ static void xenhcd_cancel_all_enqueued_urbs(struct xenhcd_info *info) list_for_each_entry_safe(urbp, tmp, &info->in_progress_list, list) { req_id = urbp->req_id; if (!urbp->unlinked) { - xenhcd_gnttab_done(&info->shadow[req_id]); + xenhcd_gnttab_done(info, req_id); + if (info->error) + return; if (urbp->urb->status == -EINPROGRESS) /* not dequeued */ xenhcd_giveback_urb(info, urbp->urb, @@ -942,8 +947,7 @@ static int xenhcd_urb_request_done(struct xenhcd_info *info) rp = info->urb_ring.sring->rsp_prod; if (RING_RESPONSE_PROD_OVERFLOW(&info->urb_ring, rp)) { xenhcd_set_error(info, "Illegal index on urb-ring"); - spin_unlock_irqrestore(&info->lock, flags); - return 0; + goto err; } rmb(); /* ensure we see queued responses up to "rp" */ @@ -952,11 +956,13 @@ static int xenhcd_urb_request_done(struct xenhcd_info *info) id = res.id; if (id >= XENUSB_URB_RING_SIZE) { xenhcd_set_error(info, "Illegal data on urb-ring"); - continue; + goto err; } if (likely(xenusb_pipesubmit(info->shadow[id].req.pipe))) { - xenhcd_gnttab_done(&info->shadow[id]); + xenhcd_gnttab_done(info, id); + if (info->error) + goto err; urb = info->shadow[id].urb; if (likely(urb)) { urb->actual_length = res.actual_length; @@ -978,6 +984,10 @@ static int xenhcd_urb_request_done(struct xenhcd_info *info) spin_unlock_irqrestore(&info->lock, flags); return more_to_do; + + err: + spin_unlock_irqrestore(&info->lock, flags); + return 0; } static int xenhcd_conn_notify(struct xenhcd_info *info) -- cgit From 5cadd4bb1d7fc9ab201ac14620d1a478357e4ebd Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: xen/9p: use alloc/free_pages_exact() Instead of __get_free_pages() and free_pages() use alloc_pages_exact() and free_pages_exact(). This is in preparation of a change of gnttab_end_foreign_access() which will prohibit use of high-order pages. By using the local variable "order" instead of ring->intf->ring_order in the error path of xen_9pfs_front_alloc_dataring() another bug is fixed, as the error path can be entered before ring->intf->ring_order is being set. By using alloc_pages_exact() the size in bytes is specified for the allocation, which fixes another bug for the case of order < (PAGE_SHIFT - XEN_PAGE_SHIFT). This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V4: - new patch --- net/9p/trans_xen.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index eb9fb55280ef..01f8067994d6 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -281,9 +281,9 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) ref = priv->rings[i].intf->ref[j]; gnttab_end_foreign_access(ref, 0, 0); } - free_pages((unsigned long)priv->rings[i].data.in, - priv->rings[i].intf->ring_order - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(priv->rings[i].data.in, + 1UL << (priv->rings[i].intf->ring_order + + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); free_page((unsigned long)priv->rings[i].intf); @@ -322,8 +322,8 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, if (ret < 0) goto out; ring->ref = ret; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - order - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT), + GFP_KERNEL | __GFP_ZERO); if (!bytes) { ret = -ENOMEM; goto out; @@ -354,9 +354,7 @@ out: if (bytes) { for (i--; i >= 0; i--) gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); - free_pages((unsigned long)bytes, - ring->intf->ring_order - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(ring->ref, 0, 0); free_page((unsigned long)ring->intf); -- cgit From b0576cc9c6b843d99c6982888d59a56209341888 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: xen/pvcalls: use alloc/free_pages_exact() Instead of __get_free_pages() and free_pages() use alloc_pages_exact() and free_pages_exact(). This is in preparation of a change of gnttab_end_foreign_access() which will prohibit use of high-order pages. This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V4: - new patch --- drivers/xen/pvcalls-front.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 3c9ae156b597..0ca351f30a6d 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -337,8 +337,8 @@ static void free_active_ring(struct sock_mapping *map) if (!map->active.ring) return; - free_pages((unsigned long)map->active.data.in, - map->active.ring->ring_order); + free_pages_exact(map->active.data.in, + PAGE_SIZE << map->active.ring->ring_order); free_page((unsigned long)map->active.ring); } @@ -352,8 +352,8 @@ static int alloc_active_ring(struct sock_mapping *map) goto out; map->active.ring->ring_order = PVCALLS_RING_ORDER; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PVCALLS_RING_ORDER); + bytes = alloc_pages_exact(PAGE_SIZE << PVCALLS_RING_ORDER, + GFP_KERNEL | __GFP_ZERO); if (!bytes) goto out; -- cgit From 42baefac638f06314298087394b982ead9ec444b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: xen/gnttab: fix gnttab_end_foreign_access() without page specified gnttab_end_foreign_access() is used to free a grant reference and optionally to free the associated page. In case the grant is still in use by the other side processing is being deferred. This leads to a problem in case no page to be freed is specified by the caller: the caller doesn't know that the page is still mapped by the other side and thus should not be used for other purposes. The correct way to handle this situation is to take an additional reference to the granted page in case handling is being deferred and to drop that reference when the grant reference could be freed finally. This requires that there are no users of gnttab_end_foreign_access() left directly repurposing the granted page after the call, as this might result in clobbered data or information leaks via the not yet freed grant reference. This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V4: - expand comment in header V5: - get page ref in case of kmalloc() failure, too --- drivers/xen/grant-table.c | 36 +++++++++++++++++++++++++++++------- include/xen/grant_table.h | 7 ++++++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index e6548910e79f..5c83d41766c8 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -133,6 +133,10 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); + /* + * Read the frame number related to a given grant reference. + */ + unsigned long (*read_frame)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -330,6 +334,16 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +static unsigned long gnttab_read_frame_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].frame; +} + +static unsigned long gnttab_read_frame_v2(grant_ref_t ref) +{ + return gnttab_shared.v2[ref].full_page.frame; +} + struct deferred_entry { struct list_head list; grant_ref_t ref; @@ -359,12 +373,9 @@ static void gnttab_handle_deferred(struct timer_list *unused) spin_unlock_irqrestore(&gnttab_list_lock, flags); if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { put_free_entry(entry->ref); - if (entry->page) { - pr_debug("freeing g.e. %#x (pfn %#lx)\n", - entry->ref, page_to_pfn(entry->page)); - put_page(entry->page); - } else - pr_info("freeing g.e. %#x\n", entry->ref); + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + put_page(entry->page); kfree(entry); entry = NULL; } else { @@ -389,9 +400,18 @@ static void gnttab_handle_deferred(struct timer_list *unused) static void gnttab_add_deferred(grant_ref_t ref, bool readonly, struct page *page) { - struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; const char *what = KERN_WARNING "leaking"; + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { + unsigned long gfn = gnttab_interface->read_frame(ref); + + page = pfn_to_page(gfn_to_pfn(gfn)); + get_page(page); + } + if (entry) { unsigned long flags; @@ -1404,6 +1424,7 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, + .read_frame = gnttab_read_frame_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1415,6 +1436,7 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, + .read_frame = gnttab_read_frame_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index ab9e692a0ef4..c9fea9389ebe 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -107,7 +107,12 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * Note that the granted page might still be accessed (read or write) by the * other side after gnttab_end_foreign_access() returns, so even if page was * specified as 0 it is not allowed to just reuse the page for other - * purposes immediately. + * purposes immediately. gnttab_end_foreign_access() will take an additional + * reference to the granted page in this case, which is dropped only after + * the grant is no longer in use. + * This requires that multi page allocations for areas subject to + * gnttab_end_foreign_access() are done via alloc_pages_exact() (and freeing + * via free_pages_exact()) in order to avoid high order pages. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); -- cgit From 66e3531b33ee51dad17c463b4d9c9f52e341503d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: xen/netfront: react properly to failing gnttab_end_foreign_access_ref() When calling gnttab_end_foreign_access_ref() the returned value must be tested and the reaction to that value should be appropriate. In case of failure in xennet_get_responses() the reaction should not be to crash the system, but to disable the network device. The calls in setup_netfront() can be replaced by calls of gnttab_end_foreign_access(). While at it avoid double free of ring pages and grant references via xennet_disconnect_backend() in this case. This is CVE-2022-23042 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - avoid double free V3: - remove pointless initializer (Jan Beulich) --- drivers/net/xen-netfront.c | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 13926d03418e..daa4e6106aac 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -988,7 +988,6 @@ static int xennet_get_responses(struct netfront_queue *queue, struct device *dev = &queue->info->netdev->dev; struct bpf_prog *xdp_prog; struct xdp_buff xdp; - unsigned long ret; int slots = 1; int err = 0; u32 verdict; @@ -1030,8 +1029,13 @@ static int xennet_get_responses(struct netfront_queue *queue, goto next; } - ret = gnttab_end_foreign_access_ref(ref, 0); - BUG_ON(!ret); + if (!gnttab_end_foreign_access_ref(ref, 0)) { + dev_alert(dev, + "Grant still in use by backend domain\n"); + queue->info->broken = true; + dev_alert(dev, "Disabled for further use\n"); + return -EINVAL; + } gnttab_release_grant_reference(&queue->gref_rx_head, ref); @@ -1252,6 +1256,10 @@ static int xennet_poll(struct napi_struct *napi, int budget) &need_xdp_flush); if (unlikely(err)) { + if (queue->info->broken) { + spin_unlock(&queue->rx_lock); + return 0; + } err: while ((skb = __skb_dequeue(&tmpq))) __skb_queue_tail(&errq, skb); @@ -1916,7 +1924,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_queue *queue, unsigned int feature_split_evtchn) { struct xen_netif_tx_sring *txs; - struct xen_netif_rx_sring *rxs; + struct xen_netif_rx_sring *rxs = NULL; grant_ref_t gref; int err; @@ -1936,21 +1944,21 @@ static int setup_netfront(struct xenbus_device *dev, err = xenbus_grant_ring(dev, txs, 1, &gref); if (err < 0) - goto grant_tx_ring_fail; + goto fail; queue->tx_ring_ref = gref; rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); if (!rxs) { err = -ENOMEM; xenbus_dev_fatal(dev, err, "allocating rx ring page"); - goto alloc_rx_ring_fail; + goto fail; } SHARED_RING_INIT(rxs); FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, rxs, 1, &gref); if (err < 0) - goto grant_rx_ring_fail; + goto fail; queue->rx_ring_ref = gref; if (feature_split_evtchn) @@ -1963,22 +1971,28 @@ static int setup_netfront(struct xenbus_device *dev, err = setup_netfront_single(queue); if (err) - goto alloc_evtchn_fail; + goto fail; return 0; /* If we fail to setup netfront, it is safe to just revoke access to * granted pages because backend is not accessing it at this point. */ -alloc_evtchn_fail: - gnttab_end_foreign_access_ref(queue->rx_ring_ref, 0); -grant_rx_ring_fail: - free_page((unsigned long)rxs); -alloc_rx_ring_fail: - gnttab_end_foreign_access_ref(queue->tx_ring_ref, 0); -grant_tx_ring_fail: - free_page((unsigned long)txs); -fail: + fail: + if (queue->rx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->rx_ring_ref, 0, + (unsigned long)rxs); + queue->rx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)rxs); + } + if (queue->tx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->tx_ring_ref, 0, + (unsigned long)txs); + queue->tx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)txs); + } return err; } -- cgit From 1760fdb6fe9f796fbdb9b4106b3e0bbacc16b55c Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 4 Mar 2022 11:56:56 +0100 Subject: mmc: core: Restore (almost) the busy polling for MMC_SEND_OP_COND Commit 76bfc7ccc2fa ("mmc: core: adjust polling interval for CMD1"), significantly decreased the polling period from ~10-12ms into just a couple of us. The purpose was to decrease the total time spent in the busy polling loop, but unfortunate it has lead to problems, that causes eMMC cards to never gets out busy and thus fails to be initialized. To fix the problem, but also to try to keep some of the new improved behaviour, let's start by using a polling period of 1-2ms, which then increases for each loop, according to common polling loop in __mmc_poll_for_busy(). Reported-by: Jean Rene Dawin Reported-by: H. Nikolaus Schaller Cc: Huijin Park Fixes: 76bfc7ccc2fa ("mmc: core: adjust polling interval for CMD1") Signed-off-by: Ulf Hansson Tested-by: Jean Rene Dawin Tested-by: H. Nikolaus Schaller Link: https://lore.kernel.org/r/20220304105656.149281-1-ulf.hansson@linaro.org --- drivers/mmc/core/block.c | 2 +- drivers/mmc/core/mmc.c | 2 +- drivers/mmc/core/mmc_ops.c | 13 +++++++++---- drivers/mmc/core/mmc_ops.h | 3 ++- drivers/mmc/core/sd.c | 2 +- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 8d718aa56d33..4e67c1403cc9 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1908,7 +1908,7 @@ static int mmc_blk_card_busy(struct mmc_card *card, struct request *req) cb_data.card = card; cb_data.status = 0; - err = __mmc_poll_for_busy(card->host, MMC_BLK_TIMEOUT_MS, + err = __mmc_poll_for_busy(card->host, 0, MMC_BLK_TIMEOUT_MS, &mmc_blk_busy_cb, &cb_data); /* diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index bbbbcaf70a59..8421519c2a98 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1962,7 +1962,7 @@ static int mmc_sleep(struct mmc_host *host) goto out_release; } - err = __mmc_poll_for_busy(host, timeout_ms, &mmc_sleep_busy_cb, host); + err = __mmc_poll_for_busy(host, 0, timeout_ms, &mmc_sleep_busy_cb, host); out_release: mmc_retune_release(host); diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index d63d1c735335..180d7e9d3400 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -21,6 +21,8 @@ #define MMC_BKOPS_TIMEOUT_MS (120 * 1000) /* 120s */ #define MMC_SANITIZE_TIMEOUT_MS (240 * 1000) /* 240s */ +#define MMC_OP_COND_PERIOD_US (1 * 1000) /* 1ms */ +#define MMC_OP_COND_TIMEOUT_MS 1000 /* 1s */ static const u8 tuning_blk_pattern_4bit[] = { 0xff, 0x0f, 0xff, 0x00, 0xff, 0xcc, 0xc3, 0xcc, @@ -232,7 +234,9 @@ int mmc_send_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr) cmd.arg = mmc_host_is_spi(host) ? 0 : ocr; cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R3 | MMC_CMD_BCR; - err = __mmc_poll_for_busy(host, 1000, &__mmc_send_op_cond_cb, &cb_data); + err = __mmc_poll_for_busy(host, MMC_OP_COND_PERIOD_US, + MMC_OP_COND_TIMEOUT_MS, + &__mmc_send_op_cond_cb, &cb_data); if (err) return err; @@ -495,13 +499,14 @@ static int mmc_busy_cb(void *cb_data, bool *busy) return 0; } -int __mmc_poll_for_busy(struct mmc_host *host, unsigned int timeout_ms, +int __mmc_poll_for_busy(struct mmc_host *host, unsigned int period_us, + unsigned int timeout_ms, int (*busy_cb)(void *cb_data, bool *busy), void *cb_data) { int err; unsigned long timeout; - unsigned int udelay = 32, udelay_max = 32768; + unsigned int udelay = period_us ? period_us : 32, udelay_max = 32768; bool expired = false; bool busy = false; @@ -546,7 +551,7 @@ int mmc_poll_for_busy(struct mmc_card *card, unsigned int timeout_ms, cb_data.retry_crc_err = retry_crc_err; cb_data.busy_cmd = busy_cmd; - return __mmc_poll_for_busy(host, timeout_ms, &mmc_busy_cb, &cb_data); + return __mmc_poll_for_busy(host, 0, timeout_ms, &mmc_busy_cb, &cb_data); } EXPORT_SYMBOL_GPL(mmc_poll_for_busy); diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h index 9c813b851d0b..09ffbc00908b 100644 --- a/drivers/mmc/core/mmc_ops.h +++ b/drivers/mmc/core/mmc_ops.h @@ -41,7 +41,8 @@ int mmc_can_ext_csd(struct mmc_card *card); int mmc_switch_status(struct mmc_card *card, bool crc_err_fatal); bool mmc_prepare_busy_cmd(struct mmc_host *host, struct mmc_command *cmd, unsigned int timeout_ms); -int __mmc_poll_for_busy(struct mmc_host *host, unsigned int timeout_ms, +int __mmc_poll_for_busy(struct mmc_host *host, unsigned int period_us, + unsigned int timeout_ms, int (*busy_cb)(void *cb_data, bool *busy), void *cb_data); int mmc_poll_for_busy(struct mmc_card *card, unsigned int timeout_ms, diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index bd87012c220c..bfbfed30dc4d 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -1672,7 +1672,7 @@ static int sd_poweroff_notify(struct mmc_card *card) cb_data.card = card; cb_data.reg_buf = reg_buf; - err = __mmc_poll_for_busy(card->host, SD_POWEROFF_NOTIFY_TIMEOUT_MS, + err = __mmc_poll_for_busy(card->host, 0, SD_POWEROFF_NOTIFY_TIMEOUT_MS, &sd_busy_poweroff_notify_cb, &cb_data); out: -- cgit From d0aeb0d4a3f7d2a0df7e9545892bbeede8f2ac7e Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 5 Mar 2022 00:58:16 -0800 Subject: isdn: hfcpci: check the return value of dma_set_mask() in setup_hw() The function dma_set_mask() in setup_hw() can fail, so its return value should be checked. Fixes: 1700fe1a10dc ("Add mISDN HFC PCI driver") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/hfcpci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index bd087cca1c1d..af17459c1a5c 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -2005,7 +2005,11 @@ setup_hw(struct hfc_pci *hc) } /* Allocate memory for FIFOS */ /* the memory needs to be on a 32k boundary within the first 4G */ - dma_set_mask(&hc->pdev->dev, 0xFFFF8000); + if (dma_set_mask(&hc->pdev->dev, 0xFFFF8000)) { + printk(KERN_WARNING + "HFC-PCI: No usable DMA configuration!\n"); + return -EIO; + } buffer = dma_alloc_coherent(&hc->pdev->dev, 0x8000, &hc->hw.dmahandle, GFP_KERNEL); /* We silently assume the address is okay if nonzero */ -- cgit From e0058f0fa80f6e09c4d363779c241c45a3c56b94 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 5 Mar 2022 01:14:11 -0800 Subject: net: qlogic: check the return value of dma_alloc_coherent() in qed_vf_hw_prepare() The function dma_alloc_coherent() in qed_vf_hw_prepare() can fail, so its return value should be checked. Fixes: 1408cc1fa48c ("qed: Introduce VFs") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_vf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 597cd9cd57b5..7b0e390c0b07 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -513,6 +513,9 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) p_iov->bulletin.size, &p_iov->bulletin.phys, GFP_KERNEL); + if (!p_iov->bulletin.p_virt) + goto free_pf2vf_reply; + DP_VERBOSE(p_hwfn, QED_MSG_IOV, "VF's bulletin Board [%p virt 0x%llx phys 0x%08x bytes]\n", p_iov->bulletin.p_virt, @@ -552,6 +555,10 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) return rc; +free_pf2vf_reply: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(union pfvf_tlvs), + p_iov->pf2vf_reply, p_iov->pf2vf_reply_phys); free_vf2pf_request: dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(union vfpf_tlvs), -- cgit From dd830aed23c6e07cd8e2a163742bf3d63c9add08 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Sat, 5 Mar 2022 12:20:39 +0100 Subject: net: lantiq_xrx200: fix use after free bug The skb->len field is read after the packet is sent to the network stack. In the meantime, skb can be freed. This patch fixes this bug. Fixes: c3e6b2c35b34 ("net: lantiq_xrx200: add ingress SG DMA support") Reported-by: Eric Dumazet Signed-off-by: Aleksander Jan Bajkowski Acked-by: Hauke Mehrtens Signed-off-by: David S. Miller --- drivers/net/ethernet/lantiq_xrx200.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/lantiq_xrx200.c b/drivers/net/ethernet/lantiq_xrx200.c index 41d11137cde0..5712c3e94be8 100644 --- a/drivers/net/ethernet/lantiq_xrx200.c +++ b/drivers/net/ethernet/lantiq_xrx200.c @@ -260,9 +260,9 @@ static int xrx200_hw_receive(struct xrx200_chan *ch) if (ctl & LTQ_DMA_EOP) { ch->skb_head->protocol = eth_type_trans(ch->skb_head, net_dev); - netif_receive_skb(ch->skb_head); net_dev->stats.rx_packets++; net_dev->stats.rx_bytes += ch->skb_head->len; + netif_receive_skb(ch->skb_head); ch->skb_head = NULL; ch->skb_tail = NULL; ret = XRX200_DMA_PACKET_COMPLETE; -- cgit From bb77bd31c281f70ec77c9c4f584950a779e05cf8 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sat, 5 Mar 2022 14:55:04 +0000 Subject: ethernet: sun: Free the coherent when failing in probing When the driver fails to register net device, it should free the DMA region first, and then do other cleanup. Signed-off-by: Zheyu Ma Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sunhme.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index ad9029ae6848..77e5dffb558f 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -3146,7 +3146,7 @@ static int happy_meal_pci_probe(struct pci_dev *pdev, if (err) { printk(KERN_ERR "happymeal(PCI): Cannot register net device, " "aborting.\n"); - goto err_out_iounmap; + goto err_out_free_coherent; } pci_set_drvdata(pdev, hp); @@ -3179,6 +3179,10 @@ static int happy_meal_pci_probe(struct pci_dev *pdev, return 0; +err_out_free_coherent: + dma_free_coherent(hp->dma_dev, PAGE_SIZE, + hp->happy_block, hp->hblock_dvma); + err_out_iounmap: iounmap(hp->gregs); -- cgit From ebe48d368e97d007bfeb76fcb065d6cfc4c96645 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:39 +0100 Subject: esp: Fix possible buffer overflow in ESP transformation The maximum message size that can be send is bigger than the maximum site that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get get_order() costs as suggested by Linus Torvalds. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert --- include/net/esp.h | 2 ++ net/ipv4/esp4.c | 5 +++++ net/ipv6/esp6.c | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d95..90cd02ff77ef 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e1b1d080e908..70e6c87fbe3d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,6 +446,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -455,6 +456,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7591160edce1..b0ffbcd5432d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -482,6 +482,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -490,6 +491,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; -- cgit From 053c8fdf2c930efdff5496960842bbb5c34ad43a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:40 +0100 Subject: esp: Fix BEET mode inter address family tunneling on GSO The xfrm{4,6}_beet_gso_segment() functions did not correctly set the SKB_GSO_IPXIP4 and SKB_GSO_IPXIP6 gso types for the address family tunneling case. Fix this by setting these gso types. Fixes: 384a46ea7bdc7 ("esp4: add gso_segment for esp4 beet mode") Fixes: 7f9e40eb18a99 ("esp6: add gso_segment for esp6 beet mode") Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index d87f02a6e934..146d4d54830c 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -160,6 +160,9 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } + if (proto == IPPROTO_IPV6) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index ba5e81cd569c..e61172d50817 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -199,6 +199,9 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, ipv6_skip_exthdr(skb, 0, &proto, &frag); } + if (proto == IPPROTO_IPIP) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) -- cgit From 23c7f8d7989e1646aac82f75761b7648c355cb8a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:41 +0100 Subject: net: Fix esp GSO on inter address family tunnels. The esp tunnel GSO handlers use skb_mac_gso_segment to push the inner packet to the segmentation handlers. However, skb_mac_gso_segment takes the Ethernet Protocol ID from 'skb->protocol' which is wrong for inter address family tunnels. We fix this by introducing a new skb_eth_gso_segment function. This function can be used if it is necessary to pass the Ethernet Protocol ID directly to the segmentation handler. First users of this function will be the esp4 and esp6 tunnel segmentation handlers. Fixes: c35fe4106b92 ("xfrm: Add mode handlers for IPsec on layer 2") Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 2 ++ net/core/gro.c | 25 +++++++++++++++++++++++++ net/ipv4/esp4_offload.c | 3 +-- net/ipv6/esp6_offload.c | 3 +-- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8b5a314db167..f53ea7038441 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4602,6 +4602,8 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/net/core/gro.c b/net/core/gro.c index a11b286d1495..b7d2b0dc59a2 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -92,6 +92,31 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 146d4d54830c..935026f4c807 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -110,8 +110,7 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IP)); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index e61172d50817..3a293838a91d 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -145,8 +145,7 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IPV6)); } static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, -- cgit From d9dc0c84ad2d4cc911ba252c973d1bf18d5eb9cf Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 5 Mar 2022 07:06:42 -0800 Subject: qed: return status of qed_iov_get_link Clang static analysis reports this issue qed_sriov.c:4727:19: warning: Assigned value is garbage or undefined ivi->max_tx_rate = tx_rate ? tx_rate : link.speed; ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ link is only sometimes set by the call to qed_iov_get_link() qed_iov_get_link fails without setting link or returning status. So change the decl to return status. Fixes: 73390ac9d82b ("qed*: support ndo_get_vf_config") Signed-off-by: Tom Rix Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_sriov.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 8ac38828ba45..48cf4355bc47 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3806,11 +3806,11 @@ bool qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *p_disabled_vfs) return found; } -static void qed_iov_get_link(struct qed_hwfn *p_hwfn, - u16 vfid, - struct qed_mcp_link_params *p_params, - struct qed_mcp_link_state *p_link, - struct qed_mcp_link_capabilities *p_caps) +static int qed_iov_get_link(struct qed_hwfn *p_hwfn, + u16 vfid, + struct qed_mcp_link_params *p_params, + struct qed_mcp_link_state *p_link, + struct qed_mcp_link_capabilities *p_caps) { struct qed_vf_info *p_vf = qed_iov_get_vf_info(p_hwfn, vfid, @@ -3818,7 +3818,7 @@ static void qed_iov_get_link(struct qed_hwfn *p_hwfn, struct qed_bulletin_content *p_bulletin; if (!p_vf) - return; + return -EINVAL; p_bulletin = p_vf->bulletin.p_virt; @@ -3828,6 +3828,7 @@ static void qed_iov_get_link(struct qed_hwfn *p_hwfn, __qed_vf_get_link_state(p_hwfn, p_link, p_bulletin); if (p_caps) __qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin); + return 0; } static int @@ -4686,6 +4687,7 @@ static int qed_get_vf_config(struct qed_dev *cdev, struct qed_public_vf_info *vf_info; struct qed_mcp_link_state link; u32 tx_rate; + int ret; /* Sanitize request */ if (IS_VF(cdev)) @@ -4699,7 +4701,9 @@ static int qed_get_vf_config(struct qed_dev *cdev, vf_info = qed_iov_get_public_vf_info(hwfn, vf_id, true); - qed_iov_get_link(hwfn, vf_id, NULL, &link, NULL); + ret = qed_iov_get_link(hwfn, vf_id, NULL, &link, NULL); + if (ret) + return ret; /* Fill information about VF */ ivi->vf = vf_id; -- cgit From c70c453abcbf3ecbaadd4c3236a5119b8da365cf Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 5 Mar 2022 17:47:20 -0300 Subject: smsc95xx: Ignore -ENODEV errors when device is unplugged According to Documentation/driver-api/usb/URB.rst when a device is unplugged usb_submit_urb() returns -ENODEV. This error code propagates all the way up to usbnet_read_cmd() and usbnet_write_cmd() calls inside the smsc95xx.c driver during Ethernet cable unplug, unbind or reboot. This causes the following errors to be shown on reboot, for example: ci_hdrc ci_hdrc.1: remove, state 1 usb usb2: USB disconnect, device number 1 usb 2-1: USB disconnect, device number 2 usb 2-1.1: USB disconnect, device number 3 smsc95xx 2-1.1:1.0 eth1: unregister 'smsc95xx' usb-ci_hdrc.1-1.1, smsc95xx USB 2.0 Ethernet smsc95xx 2-1.1:1.0 eth1: Failed to read reg index 0x00000114: -19 smsc95xx 2-1.1:1.0 eth1: Error reading MII_ACCESS smsc95xx 2-1.1:1.0 eth1: __smsc95xx_mdio_read: MII is busy smsc95xx 2-1.1:1.0 eth1: Failed to read reg index 0x00000114: -19 smsc95xx 2-1.1:1.0 eth1: Error reading MII_ACCESS smsc95xx 2-1.1:1.0 eth1: __smsc95xx_mdio_read: MII is busy smsc95xx 2-1.1:1.0 eth1: hardware isn't capable of remote wakeup usb 2-1.4: USB disconnect, device number 4 ci_hdrc ci_hdrc.1: USB bus 2 deregistered ci_hdrc ci_hdrc.0: remove, state 4 usb usb1: USB disconnect, device number 1 ci_hdrc ci_hdrc.0: USB bus 1 deregistered imx2-wdt 30280000.watchdog: Device shutdown: Expect reboot! reboot: Restarting system Ignore the -ENODEV errors inside __smsc95xx_mdio_read() and __smsc95xx_phy_wait_not_busy() and do not print error messages when -ENODEV is returned. Fixes: a049a30fc27c ("net: usb: Correct PHY handling of smsc95xx") Signed-off-by: Fabio Estevam Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index bc1e3dd67c04..a0f29482294d 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -84,9 +84,10 @@ static int __must_check __smsc95xx_read_reg(struct usbnet *dev, u32 index, ret = fn(dev, USB_VENDOR_REQUEST_READ_REGISTER, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, &buf, 4); - if (unlikely(ret < 0)) { - netdev_warn(dev->net, "Failed to read reg index 0x%08x: %d\n", - index, ret); + if (ret < 0) { + if (ret != -ENODEV) + netdev_warn(dev->net, "Failed to read reg index 0x%08x: %d\n", + index, ret); return ret; } @@ -116,7 +117,7 @@ static int __must_check __smsc95xx_write_reg(struct usbnet *dev, u32 index, ret = fn(dev, USB_VENDOR_REQUEST_WRITE_REGISTER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, &buf, 4); - if (unlikely(ret < 0)) + if (ret < 0 && ret != -ENODEV) netdev_warn(dev->net, "Failed to write reg index 0x%08x: %d\n", index, ret); @@ -159,6 +160,9 @@ static int __must_check __smsc95xx_phy_wait_not_busy(struct usbnet *dev, do { ret = __smsc95xx_read_reg(dev, MII_ADDR, &val, in_pm); if (ret < 0) { + /* Ignore -ENODEV error during disconnect() */ + if (ret == -ENODEV) + return 0; netdev_warn(dev->net, "Error reading MII_ACCESS\n"); return ret; } @@ -194,7 +198,8 @@ static int __smsc95xx_mdio_read(struct usbnet *dev, int phy_id, int idx, addr = mii_address_cmd(phy_id, idx, MII_READ_ | MII_BUSY_); ret = __smsc95xx_write_reg(dev, MII_ADDR, addr, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error writing MII_ADDR\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error writing MII_ADDR\n"); goto done; } @@ -206,7 +211,8 @@ static int __smsc95xx_mdio_read(struct usbnet *dev, int phy_id, int idx, ret = __smsc95xx_read_reg(dev, MII_DATA, &val, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error reading MII_DATA\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error reading MII_DATA\n"); goto done; } @@ -214,6 +220,10 @@ static int __smsc95xx_mdio_read(struct usbnet *dev, int phy_id, int idx, done: mutex_unlock(&dev->phy_mutex); + + /* Ignore -ENODEV error during disconnect() */ + if (ret == -ENODEV) + return 0; return ret; } @@ -235,7 +245,8 @@ static void __smsc95xx_mdio_write(struct usbnet *dev, int phy_id, val = regval; ret = __smsc95xx_write_reg(dev, MII_DATA, val, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error writing MII_DATA\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error writing MII_DATA\n"); goto done; } @@ -243,7 +254,8 @@ static void __smsc95xx_mdio_write(struct usbnet *dev, int phy_id, addr = mii_address_cmd(phy_id, idx, MII_WRITE_ | MII_BUSY_); ret = __smsc95xx_write_reg(dev, MII_ADDR, addr, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error writing MII_ADDR\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error writing MII_ADDR\n"); goto done; } -- cgit From 5f84e73f9a8f14b95115b0eb2080da6d9fa7a82e Mon Sep 17 00:00:00 2001 From: Akhil R Date: Mon, 28 Feb 2022 21:04:05 +0530 Subject: gpio: tegra186: Add IRQ per bank for Tegra241 Add the number of interrupts per bank for Tegra241 (Grace) to fix the probe failure. Fixes: d1056b771ddb ("gpio: tegra186: Add support for Tegra241") Signed-off-by: Akhil R Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tegra186.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c index 8d298beffd86..031fe105b58e 100644 --- a/drivers/gpio/gpio-tegra186.c +++ b/drivers/gpio/gpio-tegra186.c @@ -1075,6 +1075,7 @@ static const struct tegra_gpio_soc tegra241_main_soc = { .ports = tegra241_main_ports, .name = "tegra241-gpio", .instance = 0, + .num_irqs_per_bank = 8, }; #define TEGRA241_AON_GPIO_PORT(_name, _bank, _port, _pins) \ @@ -1095,6 +1096,7 @@ static const struct tegra_gpio_soc tegra241_aon_soc = { .ports = tegra241_aon_ports, .name = "tegra241-gpio-aon", .instance = 1, + .num_irqs_per_bank = 8, }; static const struct of_device_id tegra186_gpio_of_match[] = { -- cgit From fc328a7d1fcce263db0b046917a66f3aa6e68719 Mon Sep 17 00:00:00 2001 From: Marcelo Roberto Jimenez Date: Mon, 7 Mar 2022 10:57:24 +0100 Subject: gpio: Revert regression in sysfs-gpio (gpiolib.c) Some GPIO lines have stopped working after the patch commit 2ab73c6d8323f ("gpio: Support GPIO controllers without pin-ranges") And this has supposedly been fixed in the following patches commit 89ad556b7f96a ("gpio: Avoid using pin ranges with !PINCTRL") commit 6dbbf84603961 ("gpiolib: Don't free if pin ranges are not defined") But an erratic behavior where some GPIO lines work while others do not work has been introduced. This patch reverts those changes so that the sysfs-gpio interface works properly again. Signed-off-by: Marcelo Roberto Jimenez Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index a3d14277f17c..005642068c0b 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1701,11 +1701,6 @@ static inline void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) */ int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset) { -#ifdef CONFIG_PINCTRL - if (list_empty(&gc->gpiodev->pin_ranges)) - return 0; -#endif - return pinctrl_gpio_request(gc->gpiodev->base + offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_request); @@ -1717,11 +1712,6 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_request); */ void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset) { -#ifdef CONFIG_PINCTRL - if (list_empty(&gc->gpiodev->pin_ranges)) - return; -#endif - pinctrl_gpio_free(gc->gpiodev->base + offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_free); -- cgit From 660c619b9d7ccd28648ee3766cdbe94ec7b27402 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Mar 2022 13:56:23 +0200 Subject: gpiolib: acpi: Convert ACPI value of debounce to microseconds It appears that GPIO ACPI library uses ACPI debounce values directly. However, the GPIO library APIs expect the debounce timeout to be in microseconds. Convert ACPI value of debounce to microseconds. While at it, document this detail where it is appropriate. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215664 Reported-by: Kai-Heng Feng Fixes: 8dcb7a15a585 ("gpiolib: acpi: Take into account debounce settings") Signed-off-by: Andy Shevchenko Tested-by: Kai-Heng Feng Reviewed-by: Mika Westerberg Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 6 ++++-- drivers/gpio/gpiolib.c | 10 ++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index c0f6a25c3279..a5495ad31c9c 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -307,7 +307,8 @@ static struct gpio_desc *acpi_request_own_gpiod(struct gpio_chip *chip, if (IS_ERR(desc)) return desc; - ret = gpio_set_debounce_timeout(desc, agpio->debounce_timeout); + /* ACPI uses hundredths of milliseconds units */ + ret = gpio_set_debounce_timeout(desc, agpio->debounce_timeout * 10); if (ret) dev_warn(chip->parent, "Failed to set debounce-timeout for pin 0x%04X, err %d\n", @@ -1035,7 +1036,8 @@ int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int ind if (ret < 0) return ret; - ret = gpio_set_debounce_timeout(desc, info.debounce); + /* ACPI uses hundredths of milliseconds units */ + ret = gpio_set_debounce_timeout(desc, info.debounce * 10); if (ret) return ret; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 005642068c0b..defb7c464b87 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2217,6 +2217,16 @@ static int gpio_set_bias(struct gpio_desc *desc) return gpio_set_config_with_argument_optional(desc, bias, arg); } +/** + * gpio_set_debounce_timeout() - Set debounce timeout + * @desc: GPIO descriptor to set the debounce timeout + * @debounce: Debounce timeout in microseconds + * + * The function calls the certain GPIO driver to set debounce timeout + * in the hardware. + * + * Returns 0 on success, or negative error code otherwise. + */ int gpio_set_debounce_timeout(struct gpio_desc *desc, unsigned int debounce) { return gpio_set_config_with_argument_optional(desc, -- cgit From 804f468853179b9b58af05c153c411931aa5b310 Mon Sep 17 00:00:00 2001 From: Jouni Högander Date: Fri, 25 Feb 2022 09:02:28 +0200 Subject: drm/i915/psr: Set "SF Partial Frame Enable" also on full update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we are observing occasional screen flickering when PSR2 selective fetch is enabled. More specifically glitch seems to happen on full frame update when cursor moves to coords x = -1 or y = -1. According to Bspec SF Single full frame should not be set if SF Partial Frame Enable is not set. This happened to be true for ADLP as PSR2_MAN_TRK_CTL_ENABLE is always set and for ADL_P it's actually "SF Partial Frame Enable" (Bit 31). Setting "SF Partial Frame Enable" bit also on full update seems to fix screen flickering. Also make code more clear by setting PSR2_MAN_TRK_CTL_ENABLE only if not on ADL_P. Bit 31 has different meaning in ADL_P. Bspec: 49274 v2: Fix Mihai Harpau email address v3: Modify commit message and remove unnecessary comment Tested-by: Lyude Paul Fixes: 7f6002e58025 ("drm/i915/display: Enable PSR2 selective fetch by default") Reported-by: Lyude Paul Cc: Mihai Harpau Cc: José Roberto de Souza Cc: Ville Syrjälä Bugzilla: https://gitlab.freedesktop.org/drm/intel/-/issues/5077 Signed-off-by: Jouni Högander Reviewed-by: José Roberto de Souza Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220225070228.855138-1-jouni.hogander@intel.com (cherry picked from commit 8d5516d18b323cf7274d1cf5fe76f4a691f879c6) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_psr.c | 16 ++++++++++++++-- drivers/gpu/drm/i915/i915_reg.h | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index a1a663f362e7..00279e8c2775 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1406,6 +1406,13 @@ static inline u32 man_trk_ctl_single_full_frame_bit_get(struct drm_i915_private PSR2_MAN_TRK_CTL_SF_SINGLE_FULL_FRAME; } +static inline u32 man_trk_ctl_partial_frame_bit_get(struct drm_i915_private *dev_priv) +{ + return IS_ALDERLAKE_P(dev_priv) ? + ADLP_PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE : + PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE; +} + static void psr_force_hw_tracking_exit(struct intel_dp *intel_dp) { struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); @@ -1510,7 +1517,13 @@ static void psr2_man_trk_ctl_calc(struct intel_crtc_state *crtc_state, { struct intel_crtc *crtc = to_intel_crtc(crtc_state->uapi.crtc); struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); - u32 val = PSR2_MAN_TRK_CTL_ENABLE; + u32 val = 0; + + if (!IS_ALDERLAKE_P(dev_priv)) + val = PSR2_MAN_TRK_CTL_ENABLE; + + /* SF partial frame enable has to be set even on full update */ + val |= man_trk_ctl_partial_frame_bit_get(dev_priv); if (full_update) { /* @@ -1530,7 +1543,6 @@ static void psr2_man_trk_ctl_calc(struct intel_crtc_state *crtc_state, } else { drm_WARN_ON(crtc_state->uapi.crtc->dev, clip->y1 % 4 || clip->y2 % 4); - val |= PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE; val |= PSR2_MAN_TRK_CTL_SU_REGION_START_ADDR(clip->y1 / 4 + 1); val |= PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR(clip->y2 / 4 + 1); } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index c2bb33febb68..902e4c802a12 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -4829,6 +4829,7 @@ enum { #define ADLP_PSR2_MAN_TRK_CTL_SU_REGION_START_ADDR(val) REG_FIELD_PREP(ADLP_PSR2_MAN_TRK_CTL_SU_REGION_START_ADDR_MASK, val) #define ADLP_PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR_MASK REG_GENMASK(12, 0) #define ADLP_PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR(val) REG_FIELD_PREP(ADLP_PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR_MASK, val) +#define ADLP_PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE REG_BIT(31) #define ADLP_PSR2_MAN_TRK_CTL_SF_SINGLE_FULL_FRAME REG_BIT(14) #define ADLP_PSR2_MAN_TRK_CTL_SF_CONTINUOS_FULL_FRAME REG_BIT(13) -- cgit From 0c4bcfdecb1ac0967619ee7ff44871d93c08c909 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Mar 2022 16:30:44 +0100 Subject: fuse: fix pipe buffer lifetime for direct_io In FOPEN_DIRECT_IO mode, fuse_file_write_iter() calls fuse_direct_write_iter(), which normally calls fuse_direct_io(), which then imports the write buffer with fuse_get_user_pages(), which uses iov_iter_get_pages() to grab references to userspace pages instead of actually copying memory. On the filesystem device side, these pages can then either be read to userspace (via fuse_dev_read()), or splice()d over into a pipe using fuse_dev_splice_read() as pipe buffers with &nosteal_pipe_buf_ops. This is wrong because after fuse_dev_do_read() unlocks the FUSE request, the userspace filesystem can mark the request as completed, causing write() to return. At that point, the userspace filesystem should no longer have access to the pipe buffer. Fix by copying pages coming from the user address space to new pipe buffers. Reported-by: Jann Horn Fixes: c3021629a0d8 ("fuse: support splice() reading from fuse device") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 12 +++++++++++- fs/fuse/file.c | 1 + fs/fuse/fuse_i.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cd54a529460d..592730fd6e42 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -941,7 +941,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. + */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 829094451774..0fc150c1c50b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1413,6 +1413,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.user_pages = true; if (write) ap->args.in_pages = true; else diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e8e59fbdefeb..eac4984cc753 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,6 +256,7 @@ struct fuse_args { bool nocreds:1; bool in_pages:1; bool out_pages:1; + bool user_pages:1; bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; -- cgit From 42da5a4ba17070e9d99abf375a5bd70e85d2a6b8 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Sat, 19 Feb 2022 21:36:00 +0200 Subject: mtd: rawnand: omap2: Actually prevent invalid configuration and build error The root of the problem is that we are selecting symbols that have dependencies. This can cause random configurations that can fail. The cleanest solution is to avoid using select. This driver uses interfaces from the OMAP_GPMC driver so we have to depend on it instead. Fixes: 4cd335dae3cf ("mtd: rawnand: omap2: Prevent invalid configuration and build error") Signed-off-by: Roger Quadros Signed-off-by: Miquel Raynal Tested-by: Randy Dunlap Link: https://lore.kernel.org/linux-mtd/20220219193600.24892-1-rogerq@kernel.org --- drivers/mtd/nand/raw/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index d986ab4e4c35..820e5dc3bc9b 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -42,8 +42,7 @@ config MTD_NAND_OMAP2 tristate "OMAP2, OMAP3, OMAP4 and Keystone NAND controller" depends on ARCH_OMAP2PLUS || ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST depends on HAS_IOMEM - select MEMORY - select OMAP_GPMC + depends on OMAP_GPMC help Support for NAND flash on Texas Instruments OMAP2, OMAP3, OMAP4 and Keystone platforms. -- cgit From 58c9a5060cb7cd529d49c93954cdafe81c1d642a Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 3 Mar 2022 16:53:56 +0000 Subject: arm64: proton-pack: Include unprivileged eBPF status in Spectre v2 mitigation reporting The mitigations for Spectre-BHB are only applied when an exception is taken from user-space. The mitigation status is reported via the spectre_v2 sysfs vulnerabilities file. When unprivileged eBPF is enabled the mitigation in the exception vectors can be avoided by an eBPF program. When unprivileged eBPF is enabled, print a warning and report vulnerable via the sysfs vulnerabilities file. Acked-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/proton-pack.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index d3fbff00993d..6d45c63c6454 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -111,6 +112,15 @@ static const char *get_bhb_affected_string(enum mitigation_state bhb_state) } } +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false; +#endif +} + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { @@ -130,6 +140,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, v2_str = "CSV2"; fallthrough; case SPECTRE_MITIGATED: + if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); case SPECTRE_VULNERABLE: fallthrough; @@ -1125,3 +1138,16 @@ void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt, *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); } + +#ifdef CONFIG_BPF_SYSCALL +#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n" +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_state == SPECTRE_VULNERABLE || + spectre_bhb_state != SPECTRE_MITIGATED) + return; + + if (!new_state) + pr_err("WARNING: %s", EBPF_WARN); +} +#endif -- cgit From 7401b49c50c2b032223de408e28e37cbd63f4c97 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 7 Mar 2022 18:59:09 +0100 Subject: ARM: tegra: Move Nyan FHD panels to AUX bus Similarly to what was earlier done for other Nyan variants, move the eDP panel on the FHD models to the AUX bus as well. Suggested-by: Dmitry Osipenko Fixes: ef6fb9875ce0 ("ARM: tegra: Add device-tree for 1080p version of Nyan Big") Signed-off-by: Thierry Reding --- arch/arm/boot/dts/tegra124-nyan-big-fhd.dts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts b/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts index d35fb79d2f51..4db43324dafa 100644 --- a/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts +++ b/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts @@ -5,7 +5,13 @@ / { /* Version of Nyan Big with 1080p panel */ - panel { - compatible = "auo,b133htn01"; + host1x@50000000 { + dpaux@545c0000 { + aux-bus { + panel: panel { + compatible = "auo,b133htn01"; + }; + }; + }; }; }; -- cgit From aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Sat, 5 Mar 2022 18:07:14 +0100 Subject: swiotlb: rework "fix info leak with DMA_FROM_DEVICE" Unfortunately, we ended up merging an old version of the patch "fix info leak with DMA_FROM_DEVICE" instead of merging the latest one. Christoph (the swiotlb maintainer), he asked me to create an incremental fix (after I have pointed this out the mix up, and asked him for guidance). So here we go. The main differences between what we got and what was agreed are: * swiotlb_sync_single_for_device is also required to do an extra bounce * We decided not to introduce DMA_ATTR_OVERWRITE until we have exploiters * The implantation of DMA_ATTR_OVERWRITE is flawed: DMA_ATTR_OVERWRITE must take precedence over DMA_ATTR_SKIP_CPU_SYNC Thus this patch removes DMA_ATTR_OVERWRITE, and makes swiotlb_sync_single_for_device() bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale data from the swiotlb buffer. Let me note, that if the size used with dma_sync_* API is less than the size used with dma_[un]map_*, under certain circumstances we may still end up with swiotlb not being transparent. In that sense, this is no perfect fix either. To get this bullet proof, we would have to bounce the entire mapping/bounce buffer. For that we would have to figure out the starting address, and the size of the mapping in swiotlb_sync_single_for_device(). While this does seem possible, there seems to be no firm consensus on how things are supposed to work. Signed-off-by: Halil Pasic Fixes: ddbd89deb7d3 ("swiotlb: fix info leak with DMA_FROM_DEVICE") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/core-api/dma-attributes.rst | 8 -------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 23 +++++++++++++++-------- 3 files changed, 15 insertions(+), 24 deletions(-) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 17706dc91ec9..1887d92e8e92 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,11 +130,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). - -DMA_ATTR_OVERWRITE ------------------- - -This is a hint to the DMA-mapping subsystem that the device is expected to -overwrite the entire mapped size, thus the caller does not require any of the -previous buffer contents to be preserved. This allows bounce-buffering -implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 6150d11a607e..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,14 +61,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index bfc56cb21705..6db1c475ec82 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -627,10 +627,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -697,10 +701,13 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); + /* + * Unconditional bounce is necessary to avoid corruption on + * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite + * the whole lengt of the bounce buffer. + */ + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + BUG_ON(!valid_dma_direction(dir)); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, -- cgit From 1860d30466366774055d993f9b31094ede8af415 Mon Sep 17 00:00:00 2001 From: Kuldeep Singh Date: Mon, 7 Mar 2022 23:50:59 +0530 Subject: MAINTAINERS: Update git tree for Broadcom iProc SoCs Current git tree for Broadcom iProc SoCs is pretty outdated as it has not updated for a long time. Fix the reference. Signed-off-by: Kuldeep Singh --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..5d627156efd9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3900,7 +3900,7 @@ M: Scott Branden M: bcm-kernel-feedback-list@broadcom.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -T: git git://github.com/broadcom/cygnus-linux.git +T: git git://github.com/broadcom/stblinux.git F: arch/arm64/boot/dts/broadcom/northstar2/* F: arch/arm64/boot/dts/broadcom/stingray/* F: drivers/clk/bcm/clk-ns* -- cgit From 5125091d757a251a128ec38d2397c9d160394eac Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 7 Mar 2022 18:28:05 +0100 Subject: MAINTAINERS: update Krzysztof Kozlowski's email Use Krzysztof Kozlowski's @kernel.org account in maintainer entries. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220307172805.156760-1-krzysztof.kozlowski@canonical.com' Signed-off-by: Arnd Bergmann --- .mailmap | 1 + MAINTAINERS | 32 ++++++++++++++++---------------- drivers/soc/samsung/exynos-chipid.c | 2 +- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/.mailmap b/.mailmap index 10ee1103c823..b96b98a2d9de 100644 --- a/.mailmap +++ b/.mailmap @@ -216,6 +216,7 @@ Koushik Krishna Manikandan Krzysztof Kozlowski Krzysztof Kozlowski +Krzysztof Kozlowski Kuninori Morimoto Kuogee Hsieh Leonardo Bras diff --git a/MAINTAINERS b/MAINTAINERS index 1ba1e4af2cbc..a84bd0855d67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2572,7 +2572,7 @@ F: sound/soc/rockchip/ N: rockchip ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org @@ -11675,7 +11675,7 @@ F: drivers/iio/proximity/mb1232.c MAXIM MAX17040 FAMILY FUEL GAUGE DRIVERS R: Iskren Chernev -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Matheus Castello L: linux-pm@vger.kernel.org @@ -11685,7 +11685,7 @@ F: drivers/power/supply/max17040_battery.c MAXIM MAX17042 FAMILY FUEL GAUGE DRIVERS R: Hans de Goede -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Sebastian Krzyszkowiak R: Purism Kernel Team @@ -11730,7 +11730,7 @@ F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml F: drivers/power/supply/max77976_charger.c MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Bartlomiej Zolnierkiewicz L: linux-pm@vger.kernel.org S: Supported @@ -11739,7 +11739,7 @@ F: drivers/power/supply/max77693_charger.c MAXIM PMIC AND MUIC DRIVERS FOR EXYNOS BASED BOARDS M: Chanwoo Choi -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Bartlomiej Zolnierkiewicz L: linux-kernel@vger.kernel.org S: Supported @@ -12428,7 +12428,7 @@ F: include/linux/memblock.h F: mm/memblock.c MEMORY CONTROLLER DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git @@ -13565,7 +13565,7 @@ F: include/uapi/linux/nexthop.h F: net/ipv4/nexthop.c NFC SUBSYSTEM -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-nfc@lists.01.org (subscribers-only) L: netdev@vger.kernel.org S: Maintained @@ -13879,7 +13879,7 @@ F: Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml F: drivers/regulator/pf8x00-regulator.c NXP PTN5150A CC LOGIC AND EXTCON DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/extcon/extcon-ptn5150.yaml @@ -15305,7 +15305,7 @@ F: drivers/pinctrl/renesas/ PIN CONTROLLER - SAMSUNG M: Tomasz Figa -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -16947,7 +16947,7 @@ W: http://www.ibm.com/developerworks/linux/linux390/ F: drivers/s390/scsi/zfcp_* S3C ADC BATTERY DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-samsung-soc@vger.kernel.org S: Odd Fixes F: drivers/power/supply/s3c_adc_battery.c @@ -16992,7 +16992,7 @@ F: Documentation/admin-guide/LSM/SafeSetID.rst F: security/safesetid/ SAMSUNG AUDIO (ASoC) DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported @@ -17000,7 +17000,7 @@ F: Documentation/devicetree/bindings/sound/samsung* F: sound/soc/samsung/ SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -17035,7 +17035,7 @@ S: Maintained F: drivers/platform/x86/samsung-laptop.c SAMSUNG MULTIFUNCTION PMIC DEVICE DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Bartlomiej Zolnierkiewicz L: linux-kernel@vger.kernel.org L: linux-samsung-soc@vger.kernel.org @@ -17061,7 +17061,7 @@ F: drivers/media/platform/s3c-camif/ F: include/media/drv-intf/s3c_camif.h SAMSUNG S3FWRN5 NFC DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Krzysztof Opasiak L: linux-nfc@lists.01.org (subscribers-only) S: Maintained @@ -17083,7 +17083,7 @@ S: Supported F: drivers/media/i2c/s5k5baf.c SAMSUNG S5P Security SubSystem (SSS) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Vladimir Zapolskiy L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org @@ -17118,7 +17118,7 @@ F: include/linux/clk/samsung.h F: include/linux/platform_data/clk-s3c2410.h SAMSUNG SPI DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Andi Shyti L: linux-spi@vger.kernel.org L: linux-samsung-soc@vger.kernel.org diff --git a/drivers/soc/samsung/exynos-chipid.c b/drivers/soc/samsung/exynos-chipid.c index 2746d05936d3..0fb3631e7346 100644 --- a/drivers/soc/samsung/exynos-chipid.c +++ b/drivers/soc/samsung/exynos-chipid.c @@ -204,7 +204,7 @@ module_platform_driver(exynos_chipid_driver); MODULE_DESCRIPTION("Samsung Exynos ChipID controller and ASV driver"); MODULE_AUTHOR("Bartlomiej Zolnierkiewicz "); -MODULE_AUTHOR("Krzysztof Kozlowski "); +MODULE_AUTHOR("Krzysztof Kozlowski "); MODULE_AUTHOR("Pankaj Dubey "); MODULE_AUTHOR("Sylwester Nawrocki "); MODULE_LICENSE("GPL"); -- cgit From a9a5b720dc8227243f433141ba1343aa53ef57e4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Mar 2022 18:38:40 +0200 Subject: gpio: sim: Declare gpio_sim_hog_config_item_ops static Compiler is not happy: warning: symbol 'gpio_sim_hog_config_item_ops' was not declared. Should it be static? Fixes: cb8c474e79be ("gpio: sim: new testing module") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 153fe79e1bf3..bb9bb595c1a8 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -1322,7 +1322,7 @@ static void gpio_sim_hog_config_item_release(struct config_item *item) kfree(hog); } -struct configfs_item_operations gpio_sim_hog_config_item_ops = { +static struct configfs_item_operations gpio_sim_hog_config_item_ops = { .release = gpio_sim_hog_config_item_release, }; -- cgit From 6e2edd6371a497a6350bb735534c9bda2a31f43d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Mar 2022 18:00:44 +0000 Subject: arm64: Ensure execute-only permissions are not allowed without EPAN Commit 18107f8a2df6 ("arm64: Support execute-only permissions with Enhanced PAN") re-introduced execute-only permissions when EPAN is available. When EPAN is not available, arch_filter_pgprot() is supposed to change a PAGE_EXECONLY permission into PAGE_READONLY_EXEC. However, if BTI or MTE are present, such check does not detect the execute-only pgprot in the presence of PTE_GP (BTI) or MT_NORMAL_TAGGED (MTE), allowing the user to request PROT_EXEC with PROT_BTI or PROT_MTE. Remove the arch_filter_pgprot() function, change the default VM_EXEC permissions to PAGE_READONLY_EXEC and update the protection_map[] array at core_initcall() if EPAN is detected. Signed-off-by: Catalin Marinas Fixes: 18107f8a2df6 ("arm64: Support execute-only permissions with Enhanced PAN") Cc: # 5.13.x Acked-by: Will Deacon Reviewed-by: Vladimir Murzin Tested-by: Vladimir Murzin --- arch/arm64/Kconfig | 3 --- arch/arm64/include/asm/pgtable-prot.h | 4 ++-- arch/arm64/include/asm/pgtable.h | 11 ----------- arch/arm64/mm/mmap.c | 17 +++++++++++++++++ 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c7a474f71eb4..f711a8eee6ff 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1236,9 +1236,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_HAS_FILTER_PGPROT - def_bool y - # Supported by clang >= 7.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 7032f04c8ac6..b1e1b74d993c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -92,7 +92,7 @@ extern bool arm64_use_ng_mappings; #define __P001 PAGE_READONLY #define __P010 PAGE_READONLY #define __P011 PAGE_READONLY -#define __P100 PAGE_EXECONLY +#define __P100 PAGE_READONLY_EXEC /* PAGE_EXECONLY if Enhanced PAN */ #define __P101 PAGE_READONLY_EXEC #define __P110 PAGE_READONLY_EXEC #define __P111 PAGE_READONLY_EXEC @@ -101,7 +101,7 @@ extern bool arm64_use_ng_mappings; #define __S001 PAGE_READONLY #define __S010 PAGE_SHARED #define __S011 PAGE_SHARED -#define __S100 PAGE_EXECONLY +#define __S100 PAGE_READONLY_EXEC /* PAGE_EXECONLY if Enhanced PAN */ #define __S101 PAGE_READONLY_EXEC #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c4ba047a82d2..94e147e5456c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1017,17 +1017,6 @@ static inline bool arch_wants_old_prefaulted_pte(void) } #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte -static inline pgprot_t arch_filter_pgprot(pgprot_t prot) -{ - if (cpus_have_const_cap(ARM64_HAS_EPAN)) - return prot; - - if (pgprot_val(prot) != pgprot_val(PAGE_EXECONLY)) - return prot; - - return PAGE_READONLY_EXEC; -} - static inline bool pud_sect_supported(void) { return PAGE_SIZE == SZ_4K; diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index a38f54cd638c..77ada00280d9 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -7,8 +7,10 @@ #include #include +#include #include +#include #include /* @@ -38,3 +40,18 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) { return !(((pfn << PAGE_SHIFT) + size) & ~PHYS_MASK); } + +static int __init adjust_protection_map(void) +{ + /* + * With Enhanced PAN we can honour the execute-only permissions as + * there is no PAN override with such mappings. + */ + if (cpus_have_const_cap(ARM64_HAS_EPAN)) { + protection_map[VM_EXEC] = PAGE_EXECONLY; + protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; + } + + return 0; +} +arch_initcall(adjust_protection_map); -- cgit From 9470c29faa91c804aa04de4c10634bf02462bfa5 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Mon, 28 Feb 2022 19:14:36 +0100 Subject: drm/sun4i: mixer: Fix P010 and P210 format numbers It turns out that DE3 manual has inverted YUV and YVU format numbers for P010 and P210. Invert them. This was tested by playing video decoded to P010 and additionally confirmed by looking at BSP driver source. Fixes: 169ca4b38932 ("drm/sun4i: Add separate DE3 VI layer formats") Signed-off-by: Jernej Skrabec Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220228181436.1424550-1-jernej.skrabec@gmail.com --- drivers/gpu/drm/sun4i/sun8i_mixer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/sun4i/sun8i_mixer.h b/drivers/gpu/drm/sun4i/sun8i_mixer.h index 145833a9d82d..5b3fbee18671 100644 --- a/drivers/gpu/drm/sun4i/sun8i_mixer.h +++ b/drivers/gpu/drm/sun4i/sun8i_mixer.h @@ -111,10 +111,10 @@ /* format 13 is semi-planar YUV411 VUVU */ #define SUN8I_MIXER_FBFMT_YUV411 14 /* format 15 doesn't exist */ -/* format 16 is P010 YVU */ -#define SUN8I_MIXER_FBFMT_P010_YUV 17 -/* format 18 is P210 YVU */ -#define SUN8I_MIXER_FBFMT_P210_YUV 19 +#define SUN8I_MIXER_FBFMT_P010_YUV 16 +/* format 17 is P010 YVU */ +#define SUN8I_MIXER_FBFMT_P210_YUV 18 +/* format 19 is P210 YVU */ /* format 20 is packed YVU444 10-bit */ /* format 21 is packed YUV444 10-bit */ -- cgit From e5417cbf7ab5df1632e68fe7d9e6331fc0e7dbd6 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 7 Mar 2022 12:13:30 +0000 Subject: net: dsa: mt7530: fix incorrect test in mt753x_phylink_validate() Discussing one of the tests in mt753x_phylink_validate() with Landen Chao confirms that the "||" should be "&&". Fix this. Fixes: c288575f7810 ("net: dsa: mt7530: Add the support of MT7531 switch") Signed-off-by: Russell King (Oracle) Link: https://lore.kernel.org/r/E1nRCF0-00CiXD-7q@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index ff3c267d0f26..a251bc55727f 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2936,7 +2936,7 @@ mt753x_phylink_validate(struct dsa_switch *ds, int port, phylink_set_port_modes(mask); - if (state->interface != PHY_INTERFACE_MODE_TRGMII || + if (state->interface != PHY_INTERFACE_MODE_TRGMII && !phy_interface_mode_is_8023z(state->interface)) { phylink_set(mask, 10baseT_Half); phylink_set(mask, 10baseT_Full); -- cgit From 1a4e53d2fc4f68aa654ad96d13ad042e1a8e8a7d Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 7 Mar 2022 18:48:43 +0000 Subject: spi: Fix invalid sgs value max_seg_size is unsigned int and it can have a value up to 2^32 (for eg:-RZ_DMAC driver sets dma_set_max_seg_size as U32_MAX) When this value is used in min_t() as an integer type, it becomes -1 and the value of sgs becomes 0. Fix this issue by replacing the 'int' data type with 'unsigned int' in min_t(). Signed-off-by: Biju Das Reviewed-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220307184843.9994-1-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 4599b121d744..d96082dc3340 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1019,10 +1019,10 @@ int spi_map_buf(struct spi_controller *ctlr, struct device *dev, int i, ret; if (vmalloced_buf || kmap_buf) { - desc_len = min_t(int, max_seg_size, PAGE_SIZE); + desc_len = min_t(unsigned int, max_seg_size, PAGE_SIZE); sgs = DIV_ROUND_UP(len + offset_in_page(buf), desc_len); } else if (virt_addr_valid(buf)) { - desc_len = min_t(int, max_seg_size, ctlr->max_dma_len); + desc_len = min_t(unsigned int, max_seg_size, ctlr->max_dma_len); sgs = DIV_ROUND_UP(len, desc_len); } else { return -EINVAL; -- cgit From 2f6edb6bcb2f3f41d876e0eba2ba97f87a0296ea Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 8 Mar 2022 10:36:31 +1030 Subject: ARM: dts: aspeed: Fix AST2600 quad spi group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Requesting quad mode for the FMC resulted in an error: &fmc { status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_fwqspi_default>' [ 0.742963] aspeed-g6-pinctrl 1e6e2000.syscon:pinctrl: invalid function FWQSPID in map table  This is because the quad mode pins are a group of pins, not a function. After applying this patch we can request the pins and the QSPI data lines are muxed: # cat /sys/kernel/debug/pinctrl/1e6e2000.syscon\:pinctrl-aspeed-g6-pinctrl/pinmux-pins |grep 1e620000.spi pin 196 (AE12): device 1e620000.spi function FWSPID group FWQSPID pin 197 (AF12): device 1e620000.spi function FWSPID group FWQSPID pin 240 (Y1): device 1e620000.spi function FWSPID group FWQSPID pin 241 (Y2): device 1e620000.spi function FWSPID group FWQSPID pin 242 (Y3): device 1e620000.spi function FWSPID group FWQSPID pin 243 (Y4): device 1e620000.spi function FWSPID group FWQSPID Fixes: f510f04c8c83 ("ARM: dts: aspeed: Add AST2600 pinmux nodes") Signed-off-by: Joel Stanley Reviewed-by: Andrew Jeffery Link: https://lore.kernel.org/r/20220304011010.974863-1-joel@jms.id.au Link: https://lore.kernel.org/r/20220304011010.974863-1-joel@jms.id.au' Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi index 6dde51c2aed3..e4775bbceecc 100644 --- a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi +++ b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi @@ -118,7 +118,7 @@ }; pinctrl_fwqspid_default: fwqspid_default { - function = "FWQSPID"; + function = "FWSPID"; groups = "FWQSPID"; }; -- cgit From 5adf349439d29f92467e864f728dfc23180f3ef9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Mar 2022 12:23:23 +0100 Subject: x86/module: Fix the paravirt vs alternative order Ever since commit 4e6292114c74 ("x86/paravirt: Add new features for paravirt patching") there is an ordering dependency between patching paravirt ops and patching alternatives, the module loader still violates this. Fixes: 4e6292114c74 ("x86/paravirt: Add new features for paravirt patching") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Miroslav Benes Cc: Link: https://lore.kernel.org/r/20220303112825.068773913@infradead.org --- arch/x86/kernel/module.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 95fa745e310a..96d7c27b7093 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -273,6 +273,14 @@ int module_finalize(const Elf_Ehdr *hdr, retpolines = s; } + /* + * See alternative_instructions() for the ordering rules between the + * various patching types. + */ + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); @@ -290,11 +298,6 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - /* make jump label nops */ jump_label_apply_nops(me); -- cgit From 979452fbc43028675b5a5da156f91928b739dea8 Mon Sep 17 00:00:00 2001 From: Robert Foss Date: Tue, 8 Mar 2022 10:49:10 +0100 Subject: dt-bindings: drm/bridge: anx7625: Revert DPI support Revert DPI support from binding. DPI support relies on the bus-type enum which does not yet support Mipi DPI, since no v4l2_fwnode_bus_type has been defined for this bus type. When DPI for anx7625 was initially added, it assumed that V4L2_FWNODE_BUS_TYPE_PARALLEL was the correct bus type for representing DPI, which it is not. In order to prevent adding this mis-usage to the ABI, let's revert the support. Signed-off-by: Robert Foss Reviewed-by: Laurent Pinchart Reviewed-by: Rob Herring Signed-off-by: Arnd Bergmann --- .../bindings/display/bridge/analogix,anx7625.yaml | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml index 1d3e88daca04..25b5ef3f759c 100644 --- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml +++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml @@ -91,22 +91,7 @@ properties: $ref: /schemas/graph.yaml#/$defs/port-base unevaluatedProperties: false description: - MIPI DSI/DPI input. - - properties: - endpoint: - $ref: /schemas/media/video-interfaces.yaml# - type: object - additionalProperties: false - - properties: - remote-endpoint: true - - bus-type: - enum: [1, 5] - default: 1 - - data-lanes: true + Video port for MIPI DSI input. port@1: $ref: /schemas/graph.yaml#/properties/port @@ -155,8 +140,6 @@ examples: reg = <0>; anx7625_in: endpoint { remote-endpoint = <&mipi_dsi>; - bus-type = <5>; - data-lanes = <0 1 2 3>; }; }; -- cgit From d3258737afc0101f497745f83fc4038c963a6b81 Mon Sep 17 00:00:00 2001 From: Robert Foss Date: Tue, 8 Mar 2022 10:49:11 +0100 Subject: Revert "arm64: dts: mt8183: jacuzzi: Fix bus properties in anx's DSI endpoint" This reverts commit 32568ae37596b529628ac09b875f4874e614f63f. Signed-off-by: Robert Foss Reviewed-by: Chen-Yu Tsai Reviewed-by: Laurent Pinchart Acked-by: Matthias Brugger Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi index e8f133dc96b9..8f7bf33f607d 100644 --- a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi @@ -171,8 +171,6 @@ anx7625_in: endpoint { remote-endpoint = <&dsi_out>; - bus-type = <5>; - data-lanes = <0 1 2 3>; }; }; -- cgit From 25875aa71dfefd1959f07e626c4d285b88b27ac2 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 7 Mar 2022 19:28:32 +0000 Subject: ARM: include unprivileged BPF status in Spectre V2 reporting The mitigations for Spectre-BHB are only applied when an exception is taken, but when unprivileged BPF is enabled, userspace can load BPF programs that can be used to exploit the problem. When unprivileged BPF is enabled, report the vulnerable status via the spectre_v2 sysfs file. Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/spectre.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index ade967f18d06..e7fea962d632 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -1,9 +1,19 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false +#endif +} + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { @@ -31,6 +41,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, if (spectre_v2_state != SPECTRE_MITIGATED) return sprintf(buf, "%s\n", "Vulnerable"); + if (_unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + switch (spectre_v2_methods) { case SPECTRE_V2_METHOD_BPIALL: method = "Branch predictor hardening"; -- cgit From d986afd5a7b75b477ac347b222354cecd97edc87 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 4 Mar 2022 15:55:59 +0800 Subject: MAINTAINERS: Update Jisheng's email address I'm leaving synaptics. Update my email address to my korg mail address and add entries to .mailmap as well to map my work addresses to korg mail address. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/ce7213bd-28ac-6580-466e-875e755fe0ae@synaptics.com' Signed-off-by: Arnd Bergmann --- .mailmap | 2 ++ MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index b96b98a2d9de..97ccdf147111 100644 --- a/.mailmap +++ b/.mailmap @@ -187,6 +187,8 @@ Jiri Slaby Jiri Slaby Jiri Slaby Jiri Slaby +Jisheng Zhang +Jisheng Zhang Johan Hovold Johan Hovold John Paul Adrian Glaubitz diff --git a/MAINTAINERS b/MAINTAINERS index 6ec18c4bd242..d53eb153157a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2739,7 +2739,7 @@ N: stm32 N: stm ARM/Synaptics SoC support -M: Jisheng Zhang +M: Jisheng Zhang M: Sebastian Hesselbarth L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -- cgit From 7e807f4b081c5813df21da54e9a0491ea2ce16e7 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 3 Mar 2022 17:23:49 -0600 Subject: dt-bindings: mfd: Fix pinctrl node name warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent addition pinctrl.yaml in commit c09acbc499e8 ("dt-bindings: pinctrl: use pinctrl.yaml") resulted in some node name warnings: Documentation/devicetree/bindings/mfd/cirrus,lochnagar.example.dt.yaml: \ lochnagar-pinctrl: $nodename:0: 'lochnagar-pinctrl' does not match '^(pinctrl|pinmux)(@[0-9a-f]+)?$' Documentation/devicetree/bindings/mfd/cirrus,madera.example.dt.yaml: \ codec@1a: $nodename:0: 'codec@1a' does not match '^(pinctrl|pinmux)(@[0-9a-f]+)?$' Documentation/devicetree/bindings/mfd/brcm,cru.example.dt.yaml: \ pin-controller@1c0: $nodename:0: 'pin-controller@1c0' does not match '^(pinctrl|pinmux)(@[0-9a-f]+)?$' Fix the node names to the preferred 'pinctrl'. For cirrus,madera, nothing from pinctrl.yaml schema is used, so just drop the reference. Fixes: c09acbc499e8 ("dt-bindings: pinctrl: use pinctrl.yaml") Cc: Rafał Miłecki Signed-off-by: Rob Herring Acked-by: Charles Keepax Acked-by: Lee Jones Link: https://lore.kernel.org/r/20220303232350.2591143-1-robh@kernel.org --- Documentation/devicetree/bindings/mfd/brcm,cru.yaml | 4 ++-- Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml | 6 +++--- Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml | 3 --- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/brcm,cru.yaml b/Documentation/devicetree/bindings/mfd/brcm,cru.yaml index be4a2df71c25..b85819fbb07c 100644 --- a/Documentation/devicetree/bindings/mfd/brcm,cru.yaml +++ b/Documentation/devicetree/bindings/mfd/brcm,cru.yaml @@ -39,7 +39,7 @@ patternProperties: '^phy@[a-f0-9]+$': $ref: ../phy/bcm-ns-usb2-phy.yaml - '^pin-controller@[a-f0-9]+$': + '^pinctrl@[a-f0-9]+$': $ref: ../pinctrl/brcm,ns-pinmux.yaml '^syscon@[a-f0-9]+$': @@ -94,7 +94,7 @@ examples: reg = <0x180 0x4>; }; - pin-controller@1c0 { + pinctrl@1c0 { compatible = "brcm,bcm4708-pinmux"; reg = <0x1c0 0x24>; reg-names = "cru_gpio_control"; diff --git a/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml b/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml index c00ad3e21c21..ad285cb480c9 100644 --- a/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml +++ b/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml @@ -126,7 +126,7 @@ properties: clock-frequency: const: 12288000 - lochnagar-pinctrl: + pinctrl: type: object $ref: /schemas/pinctrl/cirrus,lochnagar.yaml# @@ -255,7 +255,7 @@ required: - reg - reset-gpios - lochnagar-clk - - lochnagar-pinctrl + - pinctrl additionalProperties: false @@ -293,7 +293,7 @@ examples: clock-frequency = <32768>; }; - lochnagar-pinctrl { + pinctrl { compatible = "cirrus,lochnagar-pinctrl"; gpio-controller; diff --git a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml index c85f759ae5a3..8a90d8273767 100644 --- a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml +++ b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml @@ -107,9 +107,6 @@ properties: additionalProperties: false -allOf: - - $ref: "pinctrl.yaml#" - required: - pinctrl-0 - pinctrl-names -- cgit From f6eafa4022dd61e029205bea4d4147d26e69fef2 Mon Sep 17 00:00:00 2001 From: Aswath Govindraju Date: Thu, 16 Dec 2021 09:40:11 +0530 Subject: dt-bindings: phy: ti,tcan104x-can: Document mux-states property On some boards, for routing CAN signals from controller to transceivers, muxes might need to be set. This can be implemented using mux-states property. Therefore, document the same in the respective bindings. Signed-off-by: Aswath Govindraju Reviewed-by: Rob Herring Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20211216041012.16892-2-a-govindraju@ti.com --- Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml b/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml index 6107880e5246..02b76f15e717 100644 --- a/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml +++ b/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml @@ -37,6 +37,12 @@ properties: max bit rate supported in bps minimum: 1 + mux-states: + description: + mux controller node to route the signals from controller to + transceiver. + maxItems: 1 + required: - compatible - '#phy-cells' @@ -53,4 +59,5 @@ examples: max-bitrate = <5000000>; standby-gpios = <&wakeup_gpio1 16 GPIO_ACTIVE_LOW>; enable-gpios = <&main_gpio1 67 GPIO_ACTIVE_HIGH>; + mux-states = <&mux0 1>; }; -- cgit From 330f4c53d3c2d8b11d86ec03a964b86dc81452f5 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Tue, 8 Mar 2022 20:18:20 +0100 Subject: ARM: fix build error when BPF_SYSCALL is disabled It was missing a semicolon. Signed-off-by: Emmanuel Gil Peyrot Reviewed-by: Nathan Chancellor Fixes: 25875aa71dfe ("ARM: include unprivileged BPF status in Spectre V2 reporting"). Signed-off-by: Linus Torvalds --- arch/arm/kernel/spectre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index e7fea962d632..0dcefc36fb7a 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -10,7 +10,7 @@ static bool _unprivileged_ebpf_enabled(void) #ifdef CONFIG_BPF_SYSCALL return !sysctl_unprivileged_bpf_disabled; #else - return false + return false; #endif } -- cgit From 2cf29e55894886965722e6625f6a03630b4db31d Mon Sep 17 00:00:00 2001 From: Michal Maloszewski Date: Mon, 24 Jan 2022 13:35:43 +0000 Subject: iavf: Fix handling of vlan strip virtual channel messages Modify netdev->features for vlan stripping based on virtual channel messages received from the PF. Change is needed to synchronize vlan strip status between PF sysfs and iavf ethtool. Fixes: 5951a2b9812d ("iavf: Fix VLAN feature flags after VFR") Signed-off-by: Norbert Ciosek Signed-off-by: Michal Maloszewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 40 +++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 88844d68e150..5263cefe46f5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -1834,6 +1834,22 @@ void iavf_request_reset(struct iavf_adapter *adapter) adapter->current_op = VIRTCHNL_OP_UNKNOWN; } +/** + * iavf_netdev_features_vlan_strip_set - update vlan strip status + * @netdev: ptr to netdev being adjusted + * @enable: enable or disable vlan strip + * + * Helper function to change vlan strip status in netdev->features. + */ +static void iavf_netdev_features_vlan_strip_set(struct net_device *netdev, + const bool enable) +{ + if (enable) + netdev->features |= NETIF_F_HW_VLAN_CTAG_RX; + else + netdev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; +} + /** * iavf_virtchnl_completion * @adapter: adapter structure @@ -2057,8 +2073,18 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, } break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + dev_warn(&adapter->pdev->dev, "Changing VLAN Stripping is not allowed when Port VLAN is configured\n"); + /* Vlan stripping could not be enabled by ethtool. + * Disable it in netdev->features. + */ + iavf_netdev_features_vlan_strip_set(netdev, false); + break; case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: dev_warn(&adapter->pdev->dev, "Changing VLAN Stripping is not allowed when Port VLAN is configured\n"); + /* Vlan stripping could not be disabled by ethtool. + * Enable it in netdev->features. + */ + iavf_netdev_features_vlan_strip_set(netdev, true); break; default: dev_err(&adapter->pdev->dev, "PF returned error %d (%s) to our request %d\n", @@ -2312,6 +2338,20 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->adv_rss_lock); } break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + /* PF enabled vlan strip on this VF. + * Update netdev->features if needed to be in sync with ethtool. + */ + if (!v_retval) + iavf_netdev_features_vlan_strip_set(netdev, true); + break; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + /* PF disabled vlan strip on this VF. + * Update netdev->features if needed to be in sync with ethtool. + */ + if (!v_retval) + iavf_netdev_features_vlan_strip_set(netdev, false); + break; default: if (adapter->current_op && (v_opcode != adapter->current_op)) dev_warn(&adapter->pdev->dev, "Expected response %d from PF, received %d\n", -- cgit From 57d03f5608c34079f6f15031f4e8b1e2ae95dcb0 Mon Sep 17 00:00:00 2001 From: Michal Maloszewski Date: Wed, 2 Feb 2022 12:44:54 +0000 Subject: iavf: Fix adopting new combined setting In some cases overloaded flag IAVF_FLAG_REINIT_ITR_NEEDED which should indicate that interrupts need to be completely reinitialized during reset leads to RTNL deadlocks using ethtool -C while a reset is in progress. To fix, it was added a new flag IAVF_FLAG_REINIT_MSIX_NEEDED used to trigger MSI-X reinit. New combined setting is fixed adopt after VF reset. This has been implemented by call reinit interrupt scheme during VF reset. Without this fix new combined setting has never been adopted. Fixes: 209f2f9c7181 ("iavf: Add support for VIRTCHNL_VF_OFFLOAD_VLAN_V2 negotiation") Signed-off-by: Grzegorz Szczurek Signed-off-by: Mitch Williams Signed-off-by: Michal Maloszewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 89423947ee65..4babe4705a55 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -288,6 +288,7 @@ struct iavf_adapter { #define IAVF_FLAG_REINIT_ITR_NEEDED BIT(16) #define IAVF_FLAG_QUEUES_DISABLED BIT(17) #define IAVF_FLAG_SETUP_NETDEV_FEATURES BIT(18) +#define IAVF_FLAG_REINIT_MSIX_NEEDED BIT(20) /* duplicates for common code */ #define IAVF_FLAG_DCB_ENABLED 0 /* flags for admin queue service task */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index dcf24264c7ea..8e644e9ed8da 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2120,7 +2120,7 @@ int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter) "Requested %d queues, but PF only gave us %d.\n", num_req_queues, adapter->vsi_res->num_queue_pairs); - adapter->flags |= IAVF_FLAG_REINIT_ITR_NEEDED; + adapter->flags |= IAVF_FLAG_REINIT_MSIX_NEEDED; adapter->num_req_queues = adapter->vsi_res->num_queue_pairs; iavf_schedule_reset(adapter); @@ -2727,7 +2727,8 @@ continue_reset: err); adapter->aq_required = 0; - if (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED) { + if ((adapter->flags & IAVF_FLAG_REINIT_MSIX_NEEDED) || + (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED)) { err = iavf_reinit_interrupt_scheme(adapter); if (err) goto reset_err; @@ -2799,12 +2800,13 @@ continue_reset: if (err) goto reset_err; - if (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED) { + if ((adapter->flags & IAVF_FLAG_REINIT_MSIX_NEEDED) || + (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED)) { err = iavf_request_traffic_irqs(adapter, netdev->name); if (err) goto reset_err; - adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; + adapter->flags &= ~IAVF_FLAG_REINIT_MSIX_NEEDED; } iavf_configure(adapter); @@ -2819,6 +2821,9 @@ continue_reset: iavf_change_state(adapter, __IAVF_DOWN); wake_up(&adapter->down_waitqueue); } + + adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; + mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); -- cgit From 5710ab79166504013f7c0ae6a57e7d2fd26e5c43 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 16 Feb 2022 16:51:35 -0800 Subject: i40e: stop disabling VFs due to PF error responses The i40e_vc_send_msg_to_vf_ex (and its wrapper i40e_vc_send_msg_to_vf) function has logic to detect "failure" responses sent to the VF. If a VF is sent more than I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED, then the VF is marked as disabled. In either case, a dev_info message is printed stating that a VF opcode failed. This logic originates from the early implementation of VF support in commit 5c3c48ac6bf5 ("i40e: implement virtual device interface"). That commit did not go far enough. The "logic" for this behavior seems to be that error responses somehow indicate a malicious VF. This is not really true. The PF might be sending an error for any number of reasons such as lacking resources, an unsupported operation, etc. This does not indicate a malicious VF. We already have a separate robust malicious VF detection which relies on hardware logic to detect and prevent a variety of behaviors. There is no justification for this behavior in the original implementation. In fact, a later commit 18b7af57d9c1 ("i40e: Lower some message levels") reduced the opcode failure message from a dev_err to a dev_info. In addition, recent commit 01cbf50877e6 ("i40e: Fix to not show opcode msg on unsuccessful VF MAC change") changed the logic to allow quieting it for expected failures. That commit prevented this logic from kicking in for specific circumstances. This change did not go far enough. The behavior is not documented nor is it part of any requirement for our products. Other operating systems such as the FreeBSD implementation of our driver do not include this logic. It is clear this check does not make sense, and causes problems which led to ugly workarounds. Fix this by just removing the entire logic and the need for the i40e_vc_send_msg_to_vf_ex function. Fixes: 01cbf50877e6 ("i40e: Fix to not show opcode msg on unsuccessful VF MAC change") Fixes: 5c3c48ac6bf5 ("i40e: implement virtual device interface") Signed-off-by: Jacob Keller Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 6 +-- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 57 +++------------------- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 5 -- 3 files changed, 9 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 1e57cc8c47d7..9db5001297c7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -742,10 +742,8 @@ static void i40e_dbg_dump_vf(struct i40e_pf *pf, int vf_id) vsi = pf->vsi[vf->lan_vsi_idx]; dev_info(&pf->pdev->dev, "vf %2d: VSI id=%d, seid=%d, qps=%d\n", vf_id, vf->lan_vsi_id, vsi->seid, vf->num_queue_pairs); - dev_info(&pf->pdev->dev, " num MDD=%lld, invalid msg=%lld, valid msg=%lld\n", - vf->num_mdd_events, - vf->num_invalid_msgs, - vf->num_valid_msgs); + dev_info(&pf->pdev->dev, " num MDD=%lld\n", + vf->num_mdd_events); } else { dev_info(&pf->pdev->dev, "invalid VF id %d\n", vf_id); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index dfdb6e786461..2606e8f0f19b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1917,19 +1917,17 @@ sriov_configure_out: /***********************virtual channel routines******************/ /** - * i40e_vc_send_msg_to_vf_ex + * i40e_vc_send_msg_to_vf * @vf: pointer to the VF info * @v_opcode: virtual channel opcode * @v_retval: virtual channel return value * @msg: pointer to the msg buffer * @msglen: msg length - * @is_quiet: true for not printing unsuccessful return values, false otherwise * * send msg to VF **/ -static int i40e_vc_send_msg_to_vf_ex(struct i40e_vf *vf, u32 v_opcode, - u32 v_retval, u8 *msg, u16 msglen, - bool is_quiet) +static int i40e_vc_send_msg_to_vf(struct i40e_vf *vf, u32 v_opcode, + u32 v_retval, u8 *msg, u16 msglen) { struct i40e_pf *pf; struct i40e_hw *hw; @@ -1944,25 +1942,6 @@ static int i40e_vc_send_msg_to_vf_ex(struct i40e_vf *vf, u32 v_opcode, hw = &pf->hw; abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - /* single place to detect unsuccessful return values */ - if (v_retval && !is_quiet) { - vf->num_invalid_msgs++; - dev_info(&pf->pdev->dev, "VF %d failed opcode %d, retval: %d\n", - vf->vf_id, v_opcode, v_retval); - if (vf->num_invalid_msgs > - I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED) { - dev_err(&pf->pdev->dev, - "Number of invalid messages exceeded for VF %d\n", - vf->vf_id); - dev_err(&pf->pdev->dev, "Use PF Control I/F to enable the VF\n"); - set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); - } - } else { - vf->num_valid_msgs++; - /* reset the invalid counter, if a valid message is received. */ - vf->num_invalid_msgs = 0; - } - aq_ret = i40e_aq_send_msg_to_vf(hw, abs_vf_id, v_opcode, v_retval, msg, msglen, NULL); if (aq_ret) { @@ -1975,23 +1954,6 @@ static int i40e_vc_send_msg_to_vf_ex(struct i40e_vf *vf, u32 v_opcode, return 0; } -/** - * i40e_vc_send_msg_to_vf - * @vf: pointer to the VF info - * @v_opcode: virtual channel opcode - * @v_retval: virtual channel return value - * @msg: pointer to the msg buffer - * @msglen: msg length - * - * send msg to VF - **/ -static int i40e_vc_send_msg_to_vf(struct i40e_vf *vf, u32 v_opcode, - u32 v_retval, u8 *msg, u16 msglen) -{ - return i40e_vc_send_msg_to_vf_ex(vf, v_opcode, v_retval, - msg, msglen, false); -} - /** * i40e_vc_send_resp_to_vf * @vf: pointer to the VF info @@ -2822,7 +2784,6 @@ error_param: * i40e_check_vf_permission * @vf: pointer to the VF info * @al: MAC address list from virtchnl - * @is_quiet: set true for printing msg without opcode info, false otherwise * * Check that the given list of MAC addresses is allowed. Will return -EPERM * if any address in the list is not valid. Checks the following conditions: @@ -2837,8 +2798,7 @@ error_param: * addresses might not be accurate. **/ static inline int i40e_check_vf_permission(struct i40e_vf *vf, - struct virtchnl_ether_addr_list *al, - bool *is_quiet) + struct virtchnl_ether_addr_list *al) { struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx]; @@ -2846,7 +2806,6 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, int mac2add_cnt = 0; int i; - *is_quiet = false; for (i = 0; i < al->num_elements; i++) { struct i40e_mac_filter *f; u8 *addr = al->list[i].addr; @@ -2870,7 +2829,6 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, !ether_addr_equal(addr, vf->default_lan_addr.addr)) { dev_err(&pf->pdev->dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); - *is_quiet = true; return -EPERM; } @@ -2921,7 +2879,6 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) (struct virtchnl_ether_addr_list *)msg; struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = NULL; - bool is_quiet = false; i40e_status ret = 0; int i; @@ -2938,7 +2895,7 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) */ spin_lock_bh(&vsi->mac_filter_hash_lock); - ret = i40e_check_vf_permission(vf, al, &is_quiet); + ret = i40e_check_vf_permission(vf, al); if (ret) { spin_unlock_bh(&vsi->mac_filter_hash_lock); goto error_param; @@ -2976,8 +2933,8 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) error_param: /* send the response to the VF */ - return i40e_vc_send_msg_to_vf_ex(vf, VIRTCHNL_OP_ADD_ETH_ADDR, - ret, NULL, 0, is_quiet); + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_ETH_ADDR, + ret, NULL, 0); } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 03c42fd0fea1..a554d0a0b09b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -10,8 +10,6 @@ #define I40E_VIRTCHNL_SUPPORTED_QTYPES 2 -#define I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED 10 - #define I40E_VLAN_PRIORITY_SHIFT 13 #define I40E_VLAN_MASK 0xFFF #define I40E_PRIORITY_MASK 0xE000 @@ -92,9 +90,6 @@ struct i40e_vf { u8 num_queue_pairs; /* num of qps assigned to VF vsis */ u8 num_req_queues; /* num of requested qps */ u64 num_mdd_events; /* num of mdd events detected */ - /* num of continuous malformed or invalid msgs detected */ - u64 num_invalid_msgs; - u64 num_valid_msgs; /* num of valid msgs detected */ unsigned long vf_caps; /* vf's adv. capabilities */ unsigned long vf_states; /* vf's runtime states */ -- cgit From 79498d5af8e458102242d1667cf44df1f1564e63 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 16 Feb 2022 16:51:36 -0800 Subject: ice: stop disabling VFs due to PF error responses The ice_vc_send_msg_to_vf function has logic to detect "failure" responses being sent to a VF. If a VF is sent more than ICE_DFLT_NUM_INVAL_MSGS_ALLOWED then the VF is marked as disabled. Almost identical logic also existed in the i40e driver. This logic was added to the ice driver in commit 1071a8358a28 ("ice: Implement virtchnl commands for AVF support") which itself copied from the i40e implementation in commit 5c3c48ac6bf5 ("i40e: implement virtual device interface"). Neither commit provides a proper explanation or justification of the check. In fact, later commits to i40e changed the logic to allow bypassing the check in some specific instances. The "logic" for this seems to be that error responses somehow indicate a malicious VF. This is not really true. The PF might be sending an error for any number of reasons such as lack of resources, etc. Additionally, this causes the PF to log an info message for every failed VF response which may confuse users, and can spam the kernel log. This behavior is not documented as part of any requirement for our products and other operating system drivers such as the FreeBSD implementation of our drivers do not include this type of check. In fact, the change from dev_err to dev_info in i40e commit 18b7af57d9c1 ("i40e: Lower some message levels") explains that these messages typically don't actually indicate a real issue. It is quite likely that a user who hits this in practice will be very confused as the VF will be disabled without an obvious way to recover. We already have robust malicious driver detection logic using actual hardware detection mechanisms that detect and prevent invalid device usage. Remove the logic since its not a documented requirement and the behavior is not intuitive. Fixes: 1071a8358a28 ("ice: Implement virtchnl commands for AVF support") Signed-off-by: Jacob Keller Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 18 ------------------ drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h | 3 --- 2 files changed, 21 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 408f78e3eb13..1be3cd4b2bef 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2182,24 +2182,6 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, dev = ice_pf_to_dev(pf); - /* single place to detect unsuccessful return values */ - if (v_retval) { - vf->num_inval_msgs++; - dev_info(dev, "VF %d failed opcode %d, retval: %d\n", vf->vf_id, - v_opcode, v_retval); - if (vf->num_inval_msgs > ICE_DFLT_NUM_INVAL_MSGS_ALLOWED) { - dev_err(dev, "Number of invalid messages exceeded for VF %d\n", - vf->vf_id); - dev_err(dev, "Use PF Control I/F to enable the VF\n"); - set_bit(ICE_VF_STATE_DIS, vf->vf_states); - return -EIO; - } - } else { - vf->num_valid_msgs++; - /* reset the invalid counter, if a valid message is received. */ - vf->num_inval_msgs = 0; - } - aq_ret = ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, v_opcode, v_retval, msg, msglen, NULL); if (aq_ret && pf->hw.mailboxq.sq_last_status != ICE_AQ_RC_ENOSYS) { diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 752487a1bdd6..8f27255cc0cc 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -14,7 +14,6 @@ #define ICE_MAX_MACADDR_PER_VF 18 /* Malicious Driver Detection */ -#define ICE_DFLT_NUM_INVAL_MSGS_ALLOWED 10 #define ICE_MDD_EVENTS_THRESHOLD 30 /* Static VF transaction/status register def */ @@ -134,8 +133,6 @@ struct ice_vf { unsigned int max_tx_rate; /* Maximum Tx bandwidth limit in Mbps */ DECLARE_BITMAP(vf_states, ICE_VF_STATES_NBITS); /* VF runtime states */ - u64 num_inval_msgs; /* number of continuous invalid msgs */ - u64 num_valid_msgs; /* number of valid msgs detected */ unsigned long vf_caps; /* VF's adv. capabilities */ u8 num_req_qs; /* num of queue pairs requested by VF */ u16 num_mac; -- cgit From 97b0129146b1544bbb0773585327896da3bb4e0a Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Fri, 18 Feb 2022 12:39:25 -0800 Subject: ice: Fix error with handling of bonding MTU When a bonded interface is destroyed, .ndo_change_mtu can be called during the tear-down process while the RTNL lock is held. This is a problem since the auxiliary driver linked to the LAN driver needs to be notified of the MTU change, and this requires grabbing a device_lock on the auxiliary_device's dev. Currently this is being attempted in the same execution context as the call to .ndo_change_mtu which is causing a dead-lock. Move the notification of the changed MTU to a separate execution context (watchdog service task) and eliminate the "before" notification. Fixes: 348048e724a0e ("ice: Implement iidc operations") Signed-off-by: Dave Ertman Tested-by: Jonathan Toppins Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_main.c | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 473b1f6be9de..3121f9b04f59 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -483,6 +483,7 @@ enum ice_pf_flags { ICE_FLAG_MDD_AUTO_RESET_VF, ICE_FLAG_LINK_LENIENT_MODE_ENA, ICE_FLAG_PLUG_AUX_DEV, + ICE_FLAG_MTU_CHANGED, ICE_PF_FLAGS_NBITS /* must be last */ }; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index f3c346e13b7a..6fc6514eb007 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2258,6 +2258,17 @@ static void ice_service_task(struct work_struct *work) if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) ice_plug_aux_dev(pf); + if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) { + struct iidc_event *event; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (event) { + set_bit(IIDC_EVENT_AFTER_MTU_CHANGE, event->type); + ice_send_event_to_aux(pf, event); + kfree(event); + } + } + ice_clean_adminq_subtask(pf); ice_check_media_subtask(pf); ice_check_for_hang_subtask(pf); @@ -6822,7 +6833,6 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - struct iidc_event *event; u8 count = 0; int err = 0; @@ -6857,14 +6867,6 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) return -EBUSY; } - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - set_bit(IIDC_EVENT_BEFORE_MTU_CHANGE, event->type); - ice_send_event_to_aux(pf, event); - clear_bit(IIDC_EVENT_BEFORE_MTU_CHANGE, event->type); - netdev->mtu = (unsigned int)new_mtu; /* if VSI is up, bring it down and then back up */ @@ -6872,21 +6874,18 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) err = ice_down(vsi); if (err) { netdev_err(netdev, "change MTU if_down err %d\n", err); - goto event_after; + return err; } err = ice_up(vsi); if (err) { netdev_err(netdev, "change MTU if_up err %d\n", err); - goto event_after; + return err; } } netdev_dbg(netdev, "changed MTU to %d\n", new_mtu); -event_after: - set_bit(IIDC_EVENT_AFTER_MTU_CHANGE, event->type); - ice_send_event_to_aux(pf, event); - kfree(event); + set_bit(ICE_FLAG_MTU_CHANGED, pf->flags); return err; } -- cgit From 3d97f1afd8d831e0c0dc1157418f94b8faa97b54 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 16 Jan 2022 19:46:20 +0100 Subject: ice: Don't use GFP_KERNEL in atomic context ice_misc_intr() is an irq handler. It should not sleep. Use GFP_ATOMIC instead of GFP_KERNEL when allocating some memory. Fixes: 348048e724a0 ("ice: Implement iidc operations") Signed-off-by: Christophe JAILLET Tested-by: Leszek Kaliszczuk Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 6fc6514eb007..83e3e8aae6cf 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3034,7 +3034,7 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) struct iidc_event *event; ena_mask &= ~ICE_AUX_CRIT_ERR; - event = kzalloc(sizeof(*event), GFP_KERNEL); + event = kzalloc(sizeof(*event), GFP_ATOMIC); if (event) { set_bit(IIDC_EVENT_CRIT_ERR, event->type); /* report the entire OICR value to AUX driver */ -- cgit From ad35ffa252af67d4cc7c744b9377a2b577748e3f Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Tue, 22 Feb 2022 11:43:04 +0000 Subject: ice: Fix curr_link_speed advertised speed Change curr_link_speed advertised speed, due to link_info.link_speed is not equal phy.curr_user_speed_req. Without this patch it is impossible to set advertised speed to same as link_speed. Testing Hints: Try to set advertised speed to 25G only with 25G default link (use ethtool -s 0x80000000) Fixes: 48cb27f2fd18 ("ice: Implement handlers for ethtool PHY/link operations") Signed-off-by: Grzegorz Siwik Signed-off-by: Jedrzej Jagielski Tested-by: Gurucharan (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index e2e3ef7fba7f..a5dc9824e255 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -2298,7 +2298,7 @@ ice_set_link_ksettings(struct net_device *netdev, if (err) goto done; - curr_link_speed = pi->phy.link_info.link_speed; + curr_link_speed = pi->phy.curr_user_speed_req; adv_link_speed = ice_ksettings_find_adv_link_speed(ks); /* If speed didn't get set, set it to what it currently is. -- cgit From 0a5aa8d161d19a1b12fd25b434b32f7c885c73bb Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 8 Mar 2022 17:09:15 +0900 Subject: block: fix blk_mq_attempt_bio_merge and rq_qos_throttle protection Commit 9d497e2941c3 ("block: don't protect submit_bio_checks by q_usage_counter") moved blk_mq_attempt_bio_merge and rq_qos_throttle calls out of q_usage_counter protection. However, these functions require q_usage_counter protection. The blk_mq_attempt_bio_merge call without the protection resulted in blktests block/005 failure with KASAN null- ptr-deref or use-after-free at bio merge. The rq_qos_throttle call without the protection caused kernel hang at qos throttle. To fix the failures, move the blk_mq_attempt_bio_merge and rq_qos_throttle calls back to q_usage_counter protection. Fixes: 9d497e2941c3 ("block: don't protect submit_bio_checks by q_usage_counter") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20220308080915.3473689-1-shinichiro.kawasaki@wdc.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d69ca91fbc8b..9a9185a0a2d1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2718,7 +2718,8 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, @@ -2730,6 +2731,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (unlikely(bio_queue_enter(bio))) return NULL; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + goto queue_exit; + + rq_qos_throttle(q, bio); + if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2742,12 +2748,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); +queue_exit: blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio *bio) + struct blk_plug *plug, struct bio **bio, unsigned int nsegs) { struct request *rq; @@ -2757,12 +2764,19 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { + *bio = NULL; + return NULL; + } + + rq_qos_throttle(q, *bio); + + if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; - rq->cmd_flags = bio->bi_opf; + rq->cmd_flags = (*bio)->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); return rq; @@ -2800,14 +2814,11 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) - return; - - rq_qos_throttle(q, bio); - - rq = blk_mq_get_cached_request(q, plug, bio); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { - rq = blk_mq_get_new_requests(q, plug, bio); + if (!bio) + return; + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) return; } -- cgit From b19ab4b38b06aae12442b2de95ccf58b5dc53584 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 8 Mar 2022 02:47:49 +0000 Subject: ethernet: Fix error handling in xemaclite_of_probe This node pointer is returned by of_parse_phandle() with refcount incremented in this function. Calling of_node_put() to avoid the refcount leak. As the remove function do. Fixes: 5cdaaa12866e ("net: emaclite: adding MDIO and phy lib support") Signed-off-by: Miaoqian Lin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220308024751.2320-1-linmq006@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 519599480b15..77fa2cb03aca 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1183,7 +1183,7 @@ static int xemaclite_of_probe(struct platform_device *ofdev) if (rc) { dev_err(dev, "Cannot register network device, aborting\n"); - goto error; + goto put_node; } dev_info(dev, @@ -1191,6 +1191,8 @@ static int xemaclite_of_probe(struct platform_device *ofdev) (unsigned long __force)ndev->mem_start, lp->base_addr, ndev->irq); return 0; +put_node: + of_node_put(lp->phy_node); error: free_netdev(ndev); return rc; -- cgit From c79fcc27be90b308b3fa90811aefafdd4078668c Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 8 Mar 2022 02:11:59 +0000 Subject: tipc: fix incorrect order of state message data sanity check When receiving a state message, function tipc_link_validate_msg() is called to validate its header portion. Then, its data portion is validated before it can be accessed correctly. However, current data sanity check is done after the message header is accessed to update some link variables. This commit fixes this issue by moving the data sanity check to the beginning of state message handling and right after the header sanity check. Fixes: 9aa422ad3266 ("tipc: improve size validations for received domain records") Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Link: https://lore.kernel.org/r/20220308021200.9245-1-tung.q.nguyen@dektech.com.au Signed-off-by: Jakub Kicinski --- net/tipc/link.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 1e14d7f8f28f..e260c0d557f5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2286,6 +2286,11 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; case STATE_MSG: + /* Validate Gap ACK blocks, drop if invalid */ + glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); + if (glen > dlen) + break; + l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ @@ -2311,10 +2316,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; } - /* Receive Gap ACK blocks from peer if any */ - glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); - if(glen > dlen) - break; tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); -- cgit From 7228918b34615ef6317edcd9a058a057bc54aa32 Mon Sep 17 00:00:00 2001 From: Ross Philipson Date: Wed, 23 Feb 2022 21:07:35 -0500 Subject: x86/boot: Fix memremap of setup_indirect structures As documented, the setup_indirect structure is nested inside the setup_data structures in the setup_data list. The code currently accesses the fields inside the setup_indirect structure but only the sizeof(struct setup_data) is being memremapped. No crash occurred but this is just due to how the area is remapped under the covers. Properly memremap both the setup_data and setup_indirect structures in these cases before accessing them. Fixes: b3c72fc9a78e ("x86/boot: Introduce setup_indirect") Signed-off-by: Ross Philipson Signed-off-by: Borislav Petkov Reviewed-by: Daniel Kiper Cc: Link: https://lore.kernel.org/r/1645668456-22036-2-git-send-email-ross.philipson@oracle.com --- arch/x86/kernel/e820.c | 41 +++++++++++++++++------- arch/x86/kernel/kdebugfs.c | 37 ++++++++++++++++------ arch/x86/kernel/ksysfs.c | 77 ++++++++++++++++++++++++++++++++++++---------- arch/x86/kernel/setup.c | 34 +++++++++++++++----- arch/x86/mm/ioremap.c | 24 ++++++++++++--- 5 files changed, 166 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index bc0657f0deed..f267205f2d5a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -995,8 +995,10 @@ early_param("memmap", parse_memmap_opt); */ void __init e820__reserve_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; if (!pa_data) @@ -1004,6 +1006,14 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("e820: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); /* @@ -1015,18 +1025,27 @@ void __init e820__reserve_setup_data(void) sizeof(*data) + data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - e820__range_update(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("e820: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + e820__range_update(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + pa_data = pa_next; + early_memunmap(data, len); } e820__update_table(e820_table); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 64b6da95af98..e2e89bebcbc3 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -88,11 +88,13 @@ create_setup_data_node(struct dentry *parent, int no, static int __init create_setup_data_nodes(struct dentry *parent) { + struct setup_indirect *indirect; struct setup_data_node *node; struct setup_data *data; - int error; + u64 pa_data, pa_next; struct dentry *d; - u64 pa_data; + int error; + u32 len; int no = 0; d = debugfs_create_dir("setup_data", parent); @@ -112,12 +114,29 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = -ENOMEM; goto err_dir; } - - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - node->paddr = ((struct setup_indirect *)data->data)->addr; - node->type = ((struct setup_indirect *)data->data)->type; - node->len = ((struct setup_indirect *)data->data)->len; + pa_next = data->next; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + node->paddr = indirect->addr; + node->type = indirect->type; + node->len = indirect->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } } else { node->paddr = pa_data; node->type = data->type; @@ -125,7 +144,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) } create_setup_data_node(d, no, node); - pa_data = data->next; + pa_data = pa_next; memunmap(data); no++; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index d0a19121c6a4..257892fcefa7 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -91,26 +91,41 @@ static int get_setup_data_paddr(int nr, u64 *paddr) static int __init get_setup_data_size(int nr, size_t *size) { - int i = 0; + u64 pa_data = boot_params.hdr.setup_data, pa_next; + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data = boot_params.hdr.setup_data; + int i = 0; + u32 len; while (pa_data) { data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; + pa_next = data->next; + if (nr == i) { - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - *size = ((struct setup_indirect *)data->data)->len; - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + *size = indirect->len; + else + *size = data->len; + } else { *size = data->len; + } memunmap(data); return 0; } - pa_data = data->next; + pa_data = pa_next; memunmap(data); i++; } @@ -120,9 +135,11 @@ static int __init get_setup_data_size(int nr, size_t *size) static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret; u64 paddr; - struct setup_data *data; + u32 len; ret = kobj_to_setup_data_nr(kobj, &nr); if (ret) @@ -135,10 +152,20 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT) - ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + ret = sprintf(buf, "0x%x\n", indirect->type); + } else { ret = sprintf(buf, "0x%x\n", data->type); + } + memunmap(data); return ret; } @@ -149,9 +176,10 @@ static ssize_t setup_data_data_read(struct file *fp, char *buf, loff_t off, size_t count) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret = 0; u64 paddr, len; - struct setup_data *data; void *p; ret = kobj_to_setup_data_nr(kobj, &nr); @@ -165,10 +193,27 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - paddr = ((struct setup_indirect *)data->data)->addr; - len = ((struct setup_indirect *)data->data)->len; + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } else { + /* + * Even though this is technically undefined, return + * the data as though it is a normal setup_data struct. + * This will at least allow it to be inspected. + */ + paddr += sizeof(*data); + len = data->len; + } } else { paddr += sizeof(*data); len = data->len; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7a132eb794d..90d7e1788c91 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -369,21 +369,41 @@ static void __init parse_setup_data(void) static void __init memblock_x86_reserve_range_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + memblock_reserve(pa_data, sizeof(*data) + data->len); - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - memblock_reserve(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len); + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("setup: failed to memremap indirect setup_data\n"); + return; + } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + memblock_reserve(indirect->addr, indirect->len); + } + + pa_data = pa_next; + early_memunmap(data, len); } } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 026031b3b782..ab666c4cd357 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -615,6 +615,7 @@ static bool memremap_is_efi_data(resource_size_t phys_addr, static bool memremap_is_setup_data(resource_size_t phys_addr, unsigned long size) { + struct setup_indirect *indirect; struct setup_data *data; u64 paddr, paddr_next; @@ -627,6 +628,10 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, data = memremap(paddr, sizeof(*data), MEMREMAP_WB | MEMREMAP_DEC); + if (!data) { + pr_warn("failed to memremap setup_data entry\n"); + return false; + } paddr_next = data->next; len = data->len; @@ -636,10 +641,21 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, return true; } - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - paddr = ((struct setup_indirect *)data->data)->addr; - len = ((struct setup_indirect *)data->data)->len; + if (data->type == SETUP_INDIRECT) { + memunmap(data); + data = memremap(paddr, sizeof(*data) + len, + MEMREMAP_WB | MEMREMAP_DEC); + if (!data) { + pr_warn("failed to memremap indirect setup_data\n"); + return false; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } } memunmap(data); -- cgit From 445c1470b6ef96440e7cfc42dfc160f5004fd149 Mon Sep 17 00:00:00 2001 From: Ross Philipson Date: Wed, 23 Feb 2022 21:07:36 -0500 Subject: x86/boot: Add setup_indirect support in early_memremap_is_setup_data() The x86 boot documentation describes the setup_indirect structures and how they are used. Only one of the two functions in ioremap.c that needed to be modified to be aware of the introduction of setup_indirect functionality was updated. Adds comparable support to the other function where it was missing. Fixes: b3c72fc9a78e ("x86/boot: Introduce setup_indirect") Signed-off-by: Ross Philipson Signed-off-by: Borislav Petkov Reviewed-by: Daniel Kiper Cc: Link: https://lore.kernel.org/r/1645668456-22036-3-git-send-email-ross.philipson@oracle.com --- arch/x86/mm/ioremap.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index ab666c4cd357..17a492c27306 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -676,22 +676,51 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, unsigned long size) { + struct setup_indirect *indirect; struct setup_data *data; u64 paddr, paddr_next; paddr = boot_params.hdr.setup_data; while (paddr) { - unsigned int len; + unsigned int len, size; if (phys_addr == paddr) return true; data = early_memremap_decrypted(paddr, sizeof(*data)); + if (!data) { + pr_warn("failed to early memremap setup_data entry\n"); + return false; + } + + size = sizeof(*data); paddr_next = data->next; len = data->len; - early_memunmap(data, sizeof(*data)); + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + early_memunmap(data, sizeof(*data)); + return true; + } + + if (data->type == SETUP_INDIRECT) { + size += len; + early_memunmap(data, sizeof(*data)); + data = early_memremap_decrypted(paddr, size); + if (!data) { + pr_warn("failed to early memremap indirect setup_data\n"); + return false; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } + } + + early_memunmap(data, size); if ((phys_addr > paddr) && (phys_addr < (paddr + len))) return true; -- cgit From 6babfc6e6fab068018c36e8f6605184b8c0b349d Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 8 Mar 2022 14:40:07 +0800 Subject: net: ethernet: ti: cpts: Handle error for clk_enable As the potential failure of the clk_enable(), it should be better to check it and return error if fails. Fixes: 8a2c9a5ab4b9 ("net: ethernet: ti: cpts: rework initialization/deinitialization") Signed-off-by: Jiasheng Jiang Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index dc70a6bfaa6a..92ca739fac01 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -568,7 +568,9 @@ int cpts_register(struct cpts *cpts) for (i = 0; i < CPTS_MAX_EVENTS; i++) list_add(&cpts->pool_data[i].list, &cpts->pool); - clk_enable(cpts->refclk); + err = clk_enable(cpts->refclk); + if (err) + return err; cpts_write32(cpts, CPTS_EN, control); cpts_write32(cpts, TS_PEND_EN, int_enable); -- cgit From 2a760554dcba450d3ad61b32375b50ed6d59a87c Mon Sep 17 00:00:00 2001 From: "Minghao Chi (CGEL ZTE)" Date: Tue, 8 Mar 2022 06:43:09 +0000 Subject: net:mcf8390: Use platform_get_irq() to get the interrupt It is not recommened to use platform_get_resource(pdev, IORESOURCE_IRQ) for requesting IRQ's resources any more, as they can be not ready yet in case of DT-booting. platform_get_irq() instead is a recommended way for getting IRQ even if it was not retrieved earlier. It also makes code simpler because we're getting "int" value right away and no conversion from resource to int is required. Reported-by: Zeal Robot Signed-off-by: Minghao Chi (CGEL ZTE) Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/mcf8390.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/8390/mcf8390.c b/drivers/net/ethernet/8390/mcf8390.c index e320cccba61a..90cd7bdf06f5 100644 --- a/drivers/net/ethernet/8390/mcf8390.c +++ b/drivers/net/ethernet/8390/mcf8390.c @@ -405,12 +405,12 @@ static int mcf8390_init(struct net_device *dev) static int mcf8390_probe(struct platform_device *pdev) { struct net_device *dev; - struct resource *mem, *irq; + struct resource *mem; resource_size_t msize; - int ret; + int ret, irq; - irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (irq == NULL) { + irq = platform_get_irq(pdev, 0); + if (irq < 0) { dev_err(&pdev->dev, "no IRQ specified?\n"); return -ENXIO; } @@ -433,7 +433,7 @@ static int mcf8390_probe(struct platform_device *pdev) SET_NETDEV_DEV(dev, &pdev->dev); platform_set_drvdata(pdev, dev); - dev->irq = irq->start; + dev->irq = irq; dev->base_addr = mem->start; ret = mcf8390_init(dev); -- cgit From 2169b79258c8be803d2595d6456b1e77129fe154 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 8 Mar 2022 14:57:39 +0800 Subject: net: ethernet: lpc_eth: Handle error for clk_enable As the potential failure of the clk_enable(), it should be better to check it and return error if fails. Fixes: b7370112f519 ("lpc32xx: Added ethernet driver") Signed-off-by: Jiasheng Jiang Signed-off-by: David S. Miller --- drivers/net/ethernet/nxp/lpc_eth.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index bc39558fe82b..756f97dce85b 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1471,6 +1471,7 @@ static int lpc_eth_drv_resume(struct platform_device *pdev) { struct net_device *ndev = platform_get_drvdata(pdev); struct netdata_local *pldat; + int ret; if (device_may_wakeup(&pdev->dev)) disable_irq_wake(ndev->irq); @@ -1480,7 +1481,9 @@ static int lpc_eth_drv_resume(struct platform_device *pdev) pldat = netdev_priv(ndev); /* Enable interface clock */ - clk_enable(pldat->clk); + ret = clk_enable(pldat->clk); + if (ret) + return ret; /* Reset and initialize */ __lpc_eth_reset(pldat); -- cgit From c9ffa3e2bc451816ce0295e40063514fabf2bd36 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 8 Mar 2022 07:42:47 +0000 Subject: net: marvell: prestera: Add missing of_node_put() in prestera_switch_set_base_mac_addr This node pointer is returned by of_find_compatible_node() with refcount incremented. Calling of_node_put() to aovid the refcount leak. Fixes: 501ef3066c89 ("net: marvell: prestera: Add driver for Prestera family ASIC devices") Signed-off-by: Miaoqian Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/prestera/prestera_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index cad93f747d0c..73cd0a4b7291 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -554,6 +554,7 @@ static int prestera_switch_set_base_mac_addr(struct prestera_switch *sw) dev_info(prestera_dev(sw), "using random base mac address\n"); } of_node_put(base_mac_np); + of_node_put(np); return prestera_hw_switch_mac_set(sw, sw->base_mac); } -- cgit From 71171ac8eb34ce7fe6b3267dce27c313ab3cb3ac Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 8 Mar 2022 16:12:23 +0800 Subject: ax25: Fix NULL pointer dereference in ax25_kill_by_device When two ax25 devices attempted to establish connection, the requester use ax25_create(), ax25_bind() and ax25_connect() to initiate connection. The receiver use ax25_rcv() to accept connection and use ax25_create_cb() in ax25_rcv() to create ax25_cb, but the ax25_cb->sk is NULL. When the receiver is detaching, a NULL pointer dereference bug caused by sock_hold(sk) in ax25_kill_by_device() will happen. The corresponding fail log is shown below: =============================================================== BUG: KASAN: null-ptr-deref in ax25_device_event+0xfd/0x290 Call Trace: ... ax25_device_event+0xfd/0x290 raw_notifier_call_chain+0x5e/0x70 dev_close_many+0x174/0x220 unregister_netdevice_many+0x1f7/0xa60 unregister_netdevice_queue+0x12f/0x170 unregister_netdev+0x13/0x20 mkiss_close+0xcd/0x140 tty_ldisc_release+0xc0/0x220 tty_release_struct+0x17/0xa0 tty_release+0x62d/0x670 ... This patch add condition check in ax25_kill_by_device(). If s->sk is NULL, it will goto if branch to kill device. Fixes: 4e0f718daf97 ("ax25: improve the incomplete fix to avoid UAF and NPD bugs") Reported-by: Thomas Osterried Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d53cbb4e2503..6bd097180772 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -87,6 +87,13 @@ again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { sk = s->sk; + if (!sk) { + spin_unlock_bh(&ax25_list_lock); + s->ax25_dev = NULL; + ax25_disconnect(s, ENETUNREACH); + spin_lock_bh(&ax25_list_lock); + goto again; + } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk); -- cgit From b859ebedd1e730bbda69142fca87af4e712649a1 Mon Sep 17 00:00:00 2001 From: Paul Semel Date: Tue, 8 Mar 2022 10:30:58 +0100 Subject: arm64: kasan: fix include error in MTE functions Fix `error: expected string literal in 'asm'`. This happens when compiling an ebpf object file that includes `net/net_namespace.h` from linux kernel headers. Include trace: include/net/net_namespace.h:10 include/linux/workqueue.h:9 include/linux/timer.h:8 include/linux/debugobjects.h:6 include/linux/spinlock.h:90 include/linux/workqueue.h:9 arch/arm64/include/asm/spinlock.h:9 arch/arm64/include/generated/asm/qrwlock.h:1 include/asm-generic/qrwlock.h:14 arch/arm64/include/asm/processor.h:33 arch/arm64/include/asm/kasan.h:9 arch/arm64/include/asm/mte-kasan.h:45 arch/arm64/include/asm/mte-def.h:14 Signed-off-by: Paul Semel Fixes: 2cb34276427a ("arm64: kasan: simplify and inline MTE functions") Cc: # 5.12.x Link: https://lore.kernel.org/r/bacb5387-2992-97e4-0c48-1ed925905bee@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mte-kasan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index e4704a403237..a857bcacf0fe 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -5,6 +5,7 @@ #ifndef __ASM_MTE_KASAN_H #define __ASM_MTE_KASAN_H +#include #include #ifndef __ASSEMBLY__ -- cgit From f0cfe17bcc1dd2f0872966b554a148e888833ee9 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 9 Mar 2022 14:13:02 +0100 Subject: tracing/osnoise: Do not unregister events twice Nicolas reported that using: # trace-cmd record -e all -M 10 -p osnoise --poll Resulted in the following kernel warning: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1217 at kernel/tracepoint.c:404 tracepoint_probe_unregister+0x280/0x370 [...] CPU: 0 PID: 1217 Comm: trace-cmd Not tainted 5.17.0-rc6-next-20220307-nico+ #19 RIP: 0010:tracepoint_probe_unregister+0x280/0x370 [...] CR2: 00007ff919b29497 CR3: 0000000109da4005 CR4: 0000000000170ef0 Call Trace: osnoise_workload_stop+0x36/0x90 tracing_set_tracer+0x108/0x260 tracing_set_trace_write+0x94/0xd0 ? __check_object_size.part.0+0x10a/0x150 ? selinux_file_permission+0x104/0x150 vfs_write+0xb5/0x290 ksys_write+0x5f/0xe0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7ff919a18127 [...] ---[ end trace 0000000000000000 ]--- The warning complains about an attempt to unregister an unregistered tracepoint. This happens on trace-cmd because it first stops tracing, and then switches the tracer to nop. Which is equivalent to: # cd /sys/kernel/tracing/ # echo osnoise > current_tracer # echo 0 > tracing_on # echo nop > current_tracer The osnoise tracer stops the workload when no trace instance is actually collecting data. This can be caused both by disabling tracing or disabling the tracer itself. To avoid unregistering events twice, use the existing trace_osnoise_callback_enabled variable to check if the events (and the workload) are actually active before trying to deactivate them. Link: https://lore.kernel.org/all/c898d1911f7f9303b7e14726e7cc9678fbfb4a0e.camel@redhat.com/ Link: https://lkml.kernel.org/r/938765e17d5a781c2df429a98f0b2e7cc317b022.1646823913.git.bristot@kernel.org Cc: stable@vger.kernel.org Cc: Marcelo Tosatti Fixes: 2fac8d6486d5 ("tracing/osnoise: Allow multiple instances of the same tracer") Reported-by: Nicolas Saenz Julienne Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index cfddb30e65ab..2aa3efdca755 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2200,6 +2200,17 @@ static void osnoise_workload_stop(void) if (osnoise_has_registered_instances()) return; + /* + * If callbacks were already disabled in a previous stop + * call, there is no need to disable then again. + * + * For instance, this happens when tracing is stopped via: + * echo 0 > tracing_on + * echo nop > current_tracer. + */ + if (!trace_osnoise_callback_enabled) + return; + trace_osnoise_callback_enabled = false; /* * Make sure that ftrace_nmi_enter/exit() see -- cgit From caf4c86bf136845982c5103b2661751b40c474c0 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 7 Mar 2022 19:07:40 +0100 Subject: tracing/osnoise: Force quiescent states while tracing At the moment running osnoise on a nohz_full CPU or uncontested FIFO priority and a PREEMPT_RCU kernel might have the side effect of extending grace periods too much. This will entice RCU to force a context switch on the wayward CPU to end the grace period, all while introducing unwarranted noise into the tracer. This behaviour is unavoidable as overly extending grace periods might exhaust the system's memory. This same exact problem is what extended quiescent states (EQS) were created for, conversely, rcu_momentary_dyntick_idle() emulates them by performing a zero duration EQS. So let's make use of it. In the common case rcu_momentary_dyntick_idle() is fairly inexpensive: atomically incrementing a local per-CPU counter and doing a store. So it shouldn't affect osnoise's measurements (which has a 1us granularity), so we'll call it unanimously. The uncommon case involve calling rcu_momentary_dyntick_idle() after having the osnoise process: - Receive an expedited quiescent state IPI with preemption disabled or during an RCU critical section. (activates rdp->cpu_no_qs.b.exp code-path). - Being preempted within in an RCU critical section and having the subsequent outermost rcu_read_unlock() called with interrupts disabled. (t->rcu_read_unlock_special.b.blocked code-path). Neither of those are possible at the moment, and are unlikely to be in the future given the osnoise's loop design. On top of this, the noise generated by the situations described above is unavoidable, and if not exposed by rcu_momentary_dyntick_idle() will be eventually seen in subsequent rcu_read_unlock() calls or schedule operations. Link: https://lkml.kernel.org/r/20220307180740.577607-1-nsaenzju@redhat.com Cc: stable@vger.kernel.org Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Signed-off-by: Nicolas Saenz Julienne Acked-by: Paul E. McKenney Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 2aa3efdca755..5e3c62a08fc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1386,6 +1386,26 @@ static int run_osnoise(void) osnoise_stop_tracing(); } + /* + * In some cases, notably when running on a nohz_full CPU with + * a stopped tick PREEMPT_RCU has no way to account for QSs. + * This will eventually cause unwarranted noise as PREEMPT_RCU + * will force preemption as the means of ending the current + * grace period. We avoid this problem by calling + * rcu_momentary_dyntick_idle(), which performs a zero duration + * EQS allowing PREEMPT_RCU to end the current grace period. + * This call shouldn't be wrapped inside an RCU critical + * section. + * + * Note that in non PREEMPT_RCU kernels QSs are handled through + * cond_resched() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + local_irq_disable(); + rcu_momentary_dyntick_idle(); + local_irq_enable(); + } + /* * For the non-preemptive kernel config: let threads runs, if * they so wish. -- cgit From 78cbc6513217b00be6a9904415ef7ff3619eb035 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 7 Mar 2022 08:43:03 +0800 Subject: ftrace: Fix some W=1 warnings in kernel doc comments Clean up the following clang-w1 warning: kernel/trace/ftrace.c:7827: warning: Function parameter or member 'ops' not described in 'unregister_ftrace_function'. kernel/trace/ftrace.c:7805: warning: Function parameter or member 'ops' not described in 'register_ftrace_function'. Link: https://lkml.kernel.org/r/20220307004303.26399-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a4b462b6f944..6105b7036482 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7790,7 +7790,7 @@ int ftrace_is_dead(void) /** * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. + * @ops: ops structure that holds the function for profiling. * * Register a function to be called by all functions in the * kernel. @@ -7817,7 +7817,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_function); /** * unregister_ftrace_function - unregister a function for profiling. - * @ops - ops structure that holds the function to unregister + * @ops: ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. */ -- cgit From ac77998b7ac3044f0509b097da9637184598980d Mon Sep 17 00:00:00 2001 From: Mohammad Kabat Date: Thu, 25 Mar 2021 14:38:55 +0200 Subject: net/mlx5: Fix size field in bufferx_reg struct According to HW spec the field "size" should be 16 bits in bufferx register. Fixes: e281682bf294 ("net/mlx5_core: HW data structs/types definitions cleanup") Signed-off-by: Mohammad Kabat Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 598ac3bcc901..5743f5b3414b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9900,8 +9900,8 @@ struct mlx5_ifc_bufferx_reg_bits { u8 reserved_at_0[0x6]; u8 lossy[0x1]; u8 epsb[0x1]; - u8 reserved_at_8[0xc]; - u8 size[0xc]; + u8 reserved_at_8[0x8]; + u8 size[0x10]; u8 xoff_threshold[0x10]; u8 xon_threshold[0x10]; -- cgit From 063bd355595428750803d8736a9bb7c8db67d42d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 4 Feb 2022 11:47:44 +0200 Subject: net/mlx5: Fix a race on command flush flow Fix a refcount use after free warning due to a race on command entry. Such race occurs when one of the commands releases its last refcount and frees its index and entry while another process running command flush flow takes refcount to this command entry. The process which handles commands flush may see this command as needed to be flushed if the other process released its refcount but didn't release the index yet. Fix it by adding the needed spin lock. It fixes the following warning trace: refcount_t: addition on 0; use-after-free. WARNING: CPU: 11 PID: 540311 at lib/refcount.c:25 refcount_warn_saturate+0x80/0xe0 ... RIP: 0010:refcount_warn_saturate+0x80/0xe0 ... Call Trace: mlx5_cmd_trigger_completions+0x293/0x340 [mlx5_core] mlx5_cmd_flush+0x3a/0xf0 [mlx5_core] enter_error_state+0x44/0x80 [mlx5_core] mlx5_fw_fatal_reporter_err_work+0x37/0xe0 [mlx5_core] process_one_work+0x1be/0x390 worker_thread+0x4d/0x3d0 ? rescuer_thread+0x350/0x350 kthread+0x141/0x160 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 Fixes: 50b2412b7e78 ("net/mlx5: Avoid possible free of command entry while timeout comp handler") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 17fe05809653..3eacd8739929 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -131,11 +131,8 @@ static int cmd_alloc_index(struct mlx5_cmd *cmd) static void cmd_free_index(struct mlx5_cmd *cmd, int idx) { - unsigned long flags; - - spin_lock_irqsave(&cmd->alloc_lock, flags); + lockdep_assert_held(&cmd->alloc_lock); set_bit(idx, &cmd->bitmask); - spin_unlock_irqrestore(&cmd->alloc_lock, flags); } static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) @@ -145,17 +142,21 @@ static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) { + struct mlx5_cmd *cmd = ent->cmd; + unsigned long flags; + + spin_lock_irqsave(&cmd->alloc_lock, flags); if (!refcount_dec_and_test(&ent->refcnt)) - return; + goto out; if (ent->idx >= 0) { - struct mlx5_cmd *cmd = ent->cmd; - cmd_free_index(cmd, ent->idx); up(ent->page_queue ? &cmd->pages_sem : &cmd->sem); } cmd_free_ent(ent); +out: + spin_unlock_irqrestore(&cmd->alloc_lock, flags); } static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) -- cgit From 39bab83b119faac4bf7f07173a42ed35be95147e Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Mon, 17 Jan 2022 15:32:16 +0200 Subject: net/mlx5: Fix offloading with ESWITCH_IPV4_TTL_MODIFY_ENABLE Only prio 1 is supported for nic mode when there is no ignore flow level support in firmware. But for switchdev mode, which supports fixed number of statically pre-allocated prios, this restriction is not relevant so it can be relaxed. Fixes: d671e109bd85 ("net/mlx5: Fix tc max supported prio for nic mode") Signed-off-by: Dima Chumak Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index 1e8ec4f236b2..df58cba37930 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -121,9 +121,6 @@ u32 mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains) u32 mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains) { - if (!mlx5_chains_prios_supported(chains)) - return 1; - if (mlx5_chains_ignore_flow_level_supported(chains)) return UINT_MAX; -- cgit From ad11c4f1d8fd1f03639460e425a36f7fd0ea83f5 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 16 Feb 2022 13:56:57 +0200 Subject: net/mlx5e: Lag, Only handle events from highest priority multipath entry There could be multiple multipath entries but changing the port affinity for each one doesn't make much sense and there should be a default one. So only track the entry with lowest priority value. The commit doesn't affect existing users with a single entry. Fixes: 544fe7c2e654 ("net/mlx5e: Activate HW multipath and handle port affinity based on FIB events") Signed-off-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index 1ca01a5b6cdd..626aa60b6099 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -126,6 +126,10 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, return; } + /* Handle multipath entry with lower priority value */ + if (mp->mfi && mp->mfi != fi && fi->fib_priority >= mp->mfi->fib_priority) + return; + /* Handle add/replace event */ nhs = fib_info_num_path(fi); if (nhs == 1) { @@ -135,12 +139,13 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev); if (i < 0) - i = MLX5_LAG_NORMAL_AFFINITY; - else - ++i; + return; + i++; mlx5_lag_set_port_affinity(ldev, i); } + + mp->mfi = fi; return; } -- cgit From 99a2b9be077ae3a5d97fbf5f7782e0f2e9812978 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 2 Mar 2022 17:07:08 +0200 Subject: net/mlx5e: SHAMPO, reduce TIR indication SHAMPO is an RQ / WQ feature, an indication was added to the TIR in the first place to enforce suitability between connected TIR and RQ, this enforcement does not exist in current the Firmware implementation and was redundant in the first place. Fixes: 83439f3c37aa ("net/mlx5e: Add HW-GRO offload") Signed-off-by: Ben Ben-Ishay Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tir.c | 3 --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +-- include/linux/mlx5/mlx5_ifc.h | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c index da169b816665..d4239e3b3c88 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c @@ -88,9 +88,6 @@ void mlx5e_tir_builder_build_packet_merge(struct mlx5e_tir_builder *builder, (MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ - rough_max_l2_l3_hdr_sz) >> 8); MLX5_SET(tirc, tirc, lro_timeout_period_usecs, pkt_merge_param->timeout); break; - case MLX5E_PACKET_MERGE_SHAMPO: - MLX5_SET(tirc, tirc, packet_merge_mask, MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO); - break; default: break; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bf80fb612449..3667f5ef5990 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3616,8 +3616,7 @@ static int set_feature_hw_gro(struct net_device *netdev, bool enable) goto out; } - err = mlx5e_safe_switch_params(priv, &new_params, - mlx5e_modify_tirs_packet_merge_ctx, NULL, reset); + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, reset); out: mutex_unlock(&priv->state_lock); return err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5743f5b3414b..49a48d7709ac 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3434,7 +3434,6 @@ enum { enum { MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO = BIT(0), MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO = BIT(1), - MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO = BIT(2), }; enum { -- cgit From 33970b031dc4653cc9dc80f2886976706c4c8ef1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 9 Mar 2022 19:08:42 +0000 Subject: ARM: fix co-processor register typo In the recent Spectre BHB patches, there was a typo that is only exposed in certain configurations: mcr p15,0,XX,c7,r5,4 should have been mcr p15,0,XX,c7,c5,4 Reported-by: kernel test robot Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Signed-off-by: Russell King (Oracle) Acked-by: Catalin Marinas Signed-off-by: Linus Torvalds --- arch/arm/include/asm/assembler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index b50cc0169d5c..aee73ef5b3dc 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -113,7 +113,7 @@ .endm .macro isb, args - mcr p15, 0, r0, c7, r5, 4 + mcr p15, 0, r0, c7, c5, 4 .endm #endif -- cgit From 52c9f93a9c482251cb0d224faa602ba26d462be8 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 9 Mar 2022 12:16:34 -0700 Subject: arm64: Do not include __READ_ONCE() block in assembly files When building arm64 defconfig + CONFIG_LTO_CLANG_{FULL,THIN}=y after commit 558c303c9734 ("arm64: Mitigate spectre style branch history side channels"), the following error occurs: :4:2: error: invalid fixup for movz/movk instruction mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 ^ Marc figured out that moving "#include " in include/linux/arm-smccc.h into a !__ASSEMBLY__ block resolves it. The full include chain with CONFIG_LTO=y from include/linux/arm-smccc.h: include/linux/init.h include/linux/compiler.h arch/arm64/include/asm/rwonce.h arch/arm64/include/asm/alternative-macros.h arch/arm64/include/asm/assembler.h The asm/alternative-macros.h include in asm/rwonce.h only happens when CONFIG_LTO is set, which ultimately casues asm/assembler.h to be included before the definition of ARM_SMCCC_ARCH_WORKAROUND_3. As a result, the preprocessor does not expand ARM_SMCCC_ARCH_WORKAROUND_3 in __mitigate_spectre_bhb_fw, which results in the error above. Avoid this problem by just avoiding the CONFIG_LTO=y __READ_ONCE() block in asm/rwonce.h with assembly files, as nothing in that block is useful to assembly files, which allows ARM_SMCCC_ARCH_WORKAROUND_3 to be properly expanded with CONFIG_LTO=y builds. Fixes: e35123d83ee3 ("arm64: lto: Strengthen READ_ONCE() to acquire when CONFIG_LTO=y") Cc: # 5.11.x Link: https://lore.kernel.org/r/20220309155716.3988480-1-maz@kernel.org/ Reported-by: Marc Zyngier Acked-by: James Morse Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20220309191633.2307110-1-nathan@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/rwonce.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h index 1bce62fa908a..56f7b1d4d54b 100644 --- a/arch/arm64/include/asm/rwonce.h +++ b/arch/arm64/include/asm/rwonce.h @@ -5,7 +5,7 @@ #ifndef __ASM_RWONCE_H #define __ASM_RWONCE_H -#ifdef CONFIG_LTO +#if defined(CONFIG_LTO) && !defined(__ASSEMBLY__) #include #include @@ -66,7 +66,7 @@ }) #endif /* !BUILD_VDSO */ -#endif /* CONFIG_LTO */ +#endif /* CONFIG_LTO && !__ASSEMBLY__ */ #include -- cgit From 36168e387fa7d0f1fe0cd5cf76c8cea7aee714fa Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 9 Mar 2022 15:07:27 -0700 Subject: ARM: Do not use NOCROSSREFS directive with ld.lld ld.lld does not support the NOCROSSREFS directive at the moment, which breaks the build after commit b9baf5c8c5c3 ("ARM: Spectre-BHB workaround"): ld.lld: error: ./arch/arm/kernel/vmlinux.lds:34: AT expected, but got NOCROSSREFS Support for this directive will eventually be implemented, at which point a version check can be added. To avoid breaking the build in the meantime, just define NOCROSSREFS to nothing when using ld.lld, with a link to the issue for tracking. Cc: stable@vger.kernel.org Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Link: https://github.com/ClangBuiltLinux/linux/issues/1609 Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- arch/arm/include/asm/vmlinux.lds.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index 0ef21bfae9f6..fad45c884e98 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -26,6 +26,14 @@ #define ARM_MMU_DISCARD(x) x #endif +/* + * ld.lld does not support NOCROSSREFS: + * https://github.com/ClangBuiltLinux/linux/issues/1609 + */ +#ifdef CONFIG_LD_IS_LLD +#define NOCROSSREFS +#endif + /* Set start/end symbol names to the LMA for the section */ #define ARM_LMA(sym, section) \ sym##_start = LOADADDR(section); \ -- cgit From f80cfe2f26581f188429c12bd937eb905ad3ac7b Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 8 Mar 2022 21:50:07 +0300 Subject: NFC: port100: fix use-after-free in port100_send_complete Syzbot reported UAF in port100_send_complete(). The root case is in missing usb_kill_urb() calls on error handling path of ->probe function. port100_send_complete() accesses devm allocated memory which will be freed on probe failure. We should kill this urbs before returning an error from probe function to prevent reported use-after-free Fail log: BUG: KASAN: use-after-free in port100_send_complete+0x16e/0x1a0 drivers/nfc/port100.c:935 Read of size 1 at addr ffff88801bb59540 by task ksoftirqd/2/26 ... Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x303 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 port100_send_complete+0x16e/0x1a0 drivers/nfc/port100.c:935 __usb_hcd_giveback_urb+0x2b0/0x5c0 drivers/usb/core/hcd.c:1670 ... Allocated by task 1255: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:45 [inline] set_alloc_info mm/kasan/common.c:436 [inline] ____kasan_kmalloc mm/kasan/common.c:515 [inline] ____kasan_kmalloc mm/kasan/common.c:474 [inline] __kasan_kmalloc+0xa6/0xd0 mm/kasan/common.c:524 alloc_dr drivers/base/devres.c:116 [inline] devm_kmalloc+0x96/0x1d0 drivers/base/devres.c:823 devm_kzalloc include/linux/device.h:209 [inline] port100_probe+0x8a/0x1320 drivers/nfc/port100.c:1502 Freed by task 1255: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:45 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free+0xff/0x140 mm/kasan/common.c:328 kasan_slab_free include/linux/kasan.h:236 [inline] __cache_free mm/slab.c:3437 [inline] kfree+0xf8/0x2b0 mm/slab.c:3794 release_nodes+0x112/0x1a0 drivers/base/devres.c:501 devres_release_all+0x114/0x190 drivers/base/devres.c:530 really_probe+0x626/0xcc0 drivers/base/dd.c:670 Reported-and-tested-by: syzbot+16bcb127fb73baeecb14@syzkaller.appspotmail.com Fixes: 0347a6ab300a ("NFC: port100: Commands mechanism implementation") Signed-off-by: Pavel Skripkin Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220308185007.6987-1-paskripkin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/nfc/port100.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c index d7db1a0e6be1..00d8ea6dcb5d 100644 --- a/drivers/nfc/port100.c +++ b/drivers/nfc/port100.c @@ -1612,7 +1612,9 @@ free_nfc_dev: nfc_digital_free_device(dev->nfc_digital_dev); error: + usb_kill_urb(dev->in_urb); usb_free_urb(dev->in_urb); + usb_kill_urb(dev->out_urb); usb_free_urb(dev->out_urb); usb_put_dev(dev->udev); -- cgit From 18dfc667550fe9c032a6dcc3402b50e691e18029 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 8 Mar 2022 23:15:00 +0100 Subject: selftests: pmtu.sh: Kill tcpdump processes launched by subshell. The cleanup() function takes care of killing processes launched by the test functions. It relies on variables like ${tcpdump_pids} to get the relevant PIDs. But tests are run in their own subshell, so updated *_pids values are invisible to other shells. Therefore cleanup() never sees any process to kill: $ ./tools/testing/selftests/net/pmtu.sh -t pmtu_ipv4_exception TEST: ipv4: PMTU exceptions [ OK ] TEST: ipv4: PMTU exceptions - nexthop objects [ OK ] $ pgrep -af tcpdump 6084 tcpdump -s 0 -i veth_A-R1 -w pmtu_ipv4_exception_veth_A-R1.pcap 6085 tcpdump -s 0 -i veth_R1-A -w pmtu_ipv4_exception_veth_R1-A.pcap 6086 tcpdump -s 0 -i veth_R1-B -w pmtu_ipv4_exception_veth_R1-B.pcap 6087 tcpdump -s 0 -i veth_B-R1 -w pmtu_ipv4_exception_veth_B-R1.pcap 6088 tcpdump -s 0 -i veth_A-R2 -w pmtu_ipv4_exception_veth_A-R2.pcap 6089 tcpdump -s 0 -i veth_R2-A -w pmtu_ipv4_exception_veth_R2-A.pcap 6090 tcpdump -s 0 -i veth_R2-B -w pmtu_ipv4_exception_veth_R2-B.pcap 6091 tcpdump -s 0 -i veth_B-R2 -w pmtu_ipv4_exception_veth_B-R2.pcap 6228 tcpdump -s 0 -i veth_A-R1 -w pmtu_ipv4_exception_veth_A-R1.pcap 6229 tcpdump -s 0 -i veth_R1-A -w pmtu_ipv4_exception_veth_R1-A.pcap 6230 tcpdump -s 0 -i veth_R1-B -w pmtu_ipv4_exception_veth_R1-B.pcap 6231 tcpdump -s 0 -i veth_B-R1 -w pmtu_ipv4_exception_veth_B-R1.pcap 6232 tcpdump -s 0 -i veth_A-R2 -w pmtu_ipv4_exception_veth_A-R2.pcap 6233 tcpdump -s 0 -i veth_R2-A -w pmtu_ipv4_exception_veth_R2-A.pcap 6234 tcpdump -s 0 -i veth_R2-B -w pmtu_ipv4_exception_veth_R2-B.pcap 6235 tcpdump -s 0 -i veth_B-R2 -w pmtu_ipv4_exception_veth_B-R2.pcap Fix this by running cleanup() in the context of the test subshell. Now that each test cleans the environment after completion, there's no need for calling cleanup() again when the next test starts. So let's drop it from the setup() function. This is okay because cleanup() is also called when pmtu.sh starts, so even the first test starts in a clean environment. Also, use tcpdump's immediate mode. Otherwise it might not have time to process buffered packets, resulting in missing packets or even empty pcap files for short tests. Note: PAUSE_ON_FAIL is still evaluated before cleanup(), so one can still inspect the test environment upon failure when using -p. Fixes: a92a0a7b8e7c ("selftests: pmtu: Simplify cleanup and namespace names") Signed-off-by: Guillaume Nault Reviewed-by: Shuah Khan Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 543ad7513a8e..2e8972573d91 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -865,7 +865,6 @@ setup_ovs_bridge() { setup() { [ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip - cleanup for arg do eval setup_${arg} || { echo " ${arg} not supported"; return 1; } done @@ -876,7 +875,7 @@ trace() { for arg do [ "${ns_cmd}" = "" ] && ns_cmd="${arg}" && continue - ${ns_cmd} tcpdump -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null & + ${ns_cmd} tcpdump --immediate-mode -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null & tcpdump_pids="${tcpdump_pids} $!" ns_cmd= done @@ -1836,6 +1835,10 @@ run_test() { unset IFS + # Since cleanup() relies on variables modified by this subshell, it + # has to run in this context. + trap cleanup EXIT + if [ "$VERBOSE" = "1" ]; then printf "\n##########################################################################\n\n" fi -- cgit From 94a4a4fe4c696413932eed8bdec46574de9576b8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 8 Mar 2022 23:15:03 +0100 Subject: selftests: pmtu.sh: Kill nettest processes launched in subshell. When using "run_cmd &", then "$!" refers to the PID of the subshell used to run , not the command itself. Therefore nettest_pids actually doesn't contain the list of the nettest commands running in the background. So cleanup() can't kill them and the nettest processes run until completion (fortunately they have a 5s timeout). Fix this by defining a new command for running processes in the background, for which "$!" really refers to the PID of the command run. Also, double quote variables on the modified lines, to avoid shellcheck warnings. Fixes: ece1278a9b81 ("selftests: net: add ESP-in-UDP PMTU test") Signed-off-by: Guillaume Nault Reviewed-by: Shuah Khan Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 2e8972573d91..694732e4b344 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -374,6 +374,16 @@ run_cmd() { return $rc } +run_cmd_bg() { + cmd="$*" + + if [ "$VERBOSE" = "1" ]; then + printf " COMMAND: %s &\n" "${cmd}" + fi + + $cmd 2>&1 & +} + # Find the auto-generated name for this namespace nsname() { eval echo \$NS_$1 @@ -670,10 +680,10 @@ setup_nettest_xfrm() { [ ${1} -eq 6 ] && proto="-6" || proto="" port=${2} - run_cmd ${ns_a} nettest ${proto} -q -D -s -x -p ${port} -t 5 & + run_cmd_bg "${ns_a}" nettest "${proto}" -q -D -s -x -p "${port}" -t 5 nettest_pids="${nettest_pids} $!" - run_cmd ${ns_b} nettest ${proto} -q -D -s -x -p ${port} -t 5 & + run_cmd_bg "${ns_b}" nettest "${proto}" -q -D -s -x -p "${port}" -t 5 nettest_pids="${nettest_pids} $!" } -- cgit From 03fe003547975680fdb9ff5ab0e41cb68276c4f2 Mon Sep 17 00:00:00 2001 From: Mark Featherston Date: Wed, 9 Mar 2022 17:16:16 -0800 Subject: gpio: ts4900: Do not set DAT and OE together This works around an issue with the hardware where both OE and DAT are exposed in the same register. If both are updated simultaneously, the harware makes no guarantees that OE or DAT will actually change in any given order and may result in a glitch of a few ns on a GPIO pin when changing direction and value in a single write. Setting direction to input now only affects OE bit. Setting direction to output updates DAT first, then OE. Fixes: 9c6686322d74 ("gpio: add Technologic I2C-FPGA gpio support") Signed-off-by: Mark Featherston Signed-off-by: Kris Bahnsen Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ts4900.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-ts4900.c b/drivers/gpio/gpio-ts4900.c index d885032cf814..d918d2df4de2 100644 --- a/drivers/gpio/gpio-ts4900.c +++ b/drivers/gpio/gpio-ts4900.c @@ -1,7 +1,7 @@ /* * Digital I/O driver for Technologic Systems I2C FPGA Core * - * Copyright (C) 2015 Technologic Systems + * Copyright (C) 2015, 2018 Technologic Systems * Copyright (C) 2016 Savoir-Faire Linux * * This program is free software; you can redistribute it and/or @@ -55,19 +55,33 @@ static int ts4900_gpio_direction_input(struct gpio_chip *chip, { struct ts4900_gpio_priv *priv = gpiochip_get_data(chip); - /* - * This will clear the output enable bit, the other bits are - * dontcare when this is cleared + /* Only clear the OE bit here, requires a RMW. Prevents potential issue + * with OE and data getting to the physical pin at different times. */ - return regmap_write(priv->regmap, offset, 0); + return regmap_update_bits(priv->regmap, offset, TS4900_GPIO_OE, 0); } static int ts4900_gpio_direction_output(struct gpio_chip *chip, unsigned int offset, int value) { struct ts4900_gpio_priv *priv = gpiochip_get_data(chip); + unsigned int reg; int ret; + /* If changing from an input to an output, we need to first set the + * proper data bit to what is requested and then set OE bit. This + * prevents a glitch that can occur on the IO line + */ + regmap_read(priv->regmap, offset, ®); + if (!(reg & TS4900_GPIO_OE)) { + if (value) + reg = TS4900_GPIO_OUT; + else + reg &= ~TS4900_GPIO_OUT; + + regmap_write(priv->regmap, offset, reg); + } + if (value) ret = regmap_write(priv->regmap, offset, TS4900_GPIO_OE | TS4900_GPIO_OUT); -- cgit From 55d01c98a88b346e217eaa931b32e7baea905c9a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 8 Mar 2022 09:44:54 +0100 Subject: gpio: sim: fix a typo Just noticed this when applying Andy's patch. s/childred/children/ Fixes: cb8c474e79be ("gpio: sim: new testing module") Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- drivers/gpio/gpio-sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index bb9bb595c1a8..8e5d87984a48 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -547,7 +547,7 @@ struct gpio_sim_bank { * * So we need to store the pointer to the parent struct here. We can * dereference it anywhere we need with no checks and no locking as - * it's guaranteed to survive the childred and protected by configfs + * it's guaranteed to survive the children and protected by configfs * locks. * * Same for other structures. -- cgit From b1a384d2cbccb1eb3f84765020d25e2c1929706e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 10 Mar 2022 10:22:14 +0000 Subject: ARM: fix build warning in proc-v7-bugs.c The kernel test robot discovered that building without HARDEN_BRANCH_PREDICTOR issues a warning due to a missing argument to pr_info(). Add the missing argument. Reported-by: kernel test robot Fixes: 9dd78194a372 ("ARM: report Spectre v2 status through sysfs") Signed-off-by: Russell King (Oracle) Signed-off-by: Linus Torvalds --- arch/arm/mm/proc-v7-bugs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index c226feab2457..06dbfb968182 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -108,7 +108,8 @@ static unsigned int spectre_v2_install_workaround(unsigned int method) #else static unsigned int spectre_v2_install_workaround(unsigned int method) { - pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n"); + pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n", + smp_processor_id()); return SPECTRE_VULNERABLE; } -- cgit From a1cc1697bb56cdf880ad4d17b79a39ef2c294bc9 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 10 Mar 2022 11:39:23 +0100 Subject: arm64: dts: marvell: armada-37xx: Remap IO space to bus address 0x0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy and old PCI I/O based cards do not support 32-bit I/O addressing. Since commit 64f160e19e92 ("PCI: aardvark: Configure PCIe resources from 'ranges' DT property") kernel can set different PCIe address on CPU and different on the bus for the one A37xx address mapping without any firmware support in case the bus address does not conflict with other A37xx mapping. So remap I/O space to the bus address 0x0 to enable support for old legacy I/O port based cards which have hardcoded I/O ports in low address space. Note that DDR on A37xx is mapped to bus address 0x0. And mapping of I/O space can be set to address 0x0 too because MEM space and I/O space are separate and so do not conflict. Remapping IO space on Turris Mox to different address is not possible to due bootloader bug. Signed-off-by: Pali Rohár Reported-by: Arnd Bergmann Fixes: 76f6386b25cc ("arm64: dts: marvell: Add Aardvark PCIe support for Armada 3700") Cc: stable@vger.kernel.org # 64f160e19e92 ("PCI: aardvark: Configure PCIe resources from 'ranges' DT property") Cc: stable@vger.kernel.org # 514ef1e62d65 ("arm64: dts: marvell: armada-37xx: Extend PCIe MEM space") Reviewed-by: Arnd Bergmann Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts | 7 ++++++- arch/arm64/boot/dts/marvell/armada-37xx.dtsi | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index 04da07ae4420..4b377fe807e0 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -138,7 +138,9 @@ /* * U-Boot port for Turris Mox has a bug which always expects that "ranges" DT property * contains exactly 2 ranges with 3 (child) address cells, 2 (parent) address cells and - * 2 size cells and also expects that the second range starts at 16 MB offset. If these + * 2 size cells and also expects that the second range starts at 16 MB offset. Also it + * expects that first range uses same address for PCI (child) and CPU (parent) cells (so + * no remapping) and that this address is the lowest from all specified ranges. If these * conditions are not met then U-Boot crashes during loading kernel DTB file. PCIe address * space is 128 MB long, so the best split between MEM and IO is to use fixed 16 MB window * for IO and the rest 112 MB (64+32+16) for MEM, despite that maximal IO size is just 64 kB. @@ -147,6 +149,9 @@ * https://source.denx.de/u-boot/u-boot/-/commit/cb2ddb291ee6fcbddd6d8f4ff49089dfe580f5d7 * https://source.denx.de/u-boot/u-boot/-/commit/c64ac3b3185aeb3846297ad7391fc6df8ecd73bf * https://source.denx.de/u-boot/u-boot/-/commit/4a82fca8e330157081fc132a591ebd99ba02ee33 + * Bug related to requirement of same child and parent addresses for first range is fixed + * in U-Boot version 2022.04 by following commit: + * https://source.denx.de/u-boot/u-boot/-/commit/1fd54253bca7d43d046bba4853fe5fafd034bc17 */ #address-cells = <3>; #size-cells = <2>; diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index 673f4906eef9..fb78ef613b29 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -499,7 +499,7 @@ * (totaling 127 MiB) for MEM. */ ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x07f00000 /* Port 0 MEM */ - 0x81000000 0 0xefff0000 0 0xefff0000 0 0x00010000>; /* Port 0 IO */ + 0x81000000 0 0x00000000 0 0xefff0000 0 0x00010000>; /* Port 0 IO */ interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &pcie_intc 0>, <0 0 0 2 &pcie_intc 1>, -- cgit From c80ee64a8020ef1a6a92109798080786829b8994 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 11 Feb 2022 00:49:43 +0800 Subject: riscv: alternative only works on !XIP_KERNEL The alternative mechanism needs runtime code patching, it can't work on XIP_KERNEL. And the errata workarounds are implemented via the alternative mechanism. So add !XIP_KERNEL dependency for alternative and erratas. Signed-off-by: Jisheng Zhang Fixes: 44c922572952 ("RISC-V: enable XIP") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.erratas | 1 + arch/riscv/Kconfig.socs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index b44d6ecdb46e..0aacd7052585 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -2,6 +2,7 @@ menu "CPU errata selection" config RISCV_ERRATA_ALTERNATIVE bool "RISC-V alternative scheme" + depends on !XIP_KERNEL default y help This Kconfig allows the kernel to automatically patch the diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 6ec44a22278a..c112ab2a9052 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -14,8 +14,8 @@ config SOC_SIFIVE select CLK_SIFIVE select CLK_SIFIVE_PRCI select SIFIVE_PLIC - select RISCV_ERRATA_ALTERNATIVE - select ERRATA_SIFIVE + select RISCV_ERRATA_ALTERNATIVE if !XIP_KERNEL + select ERRATA_SIFIVE if !XIP_KERNEL help This enables support for SiFive SoC platform hardware. -- cgit From fe673d3f5bf1fc50cdc4b754831db91a2ec10126 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 8 Mar 2022 11:55:48 -0800 Subject: mm: gup: make fault_in_safe_writeable() use fixup_user_fault() Instead of using GUP, make fault_in_safe_writeable() actually force a 'handle_mm_fault()' using the same fixup_user_fault() machinery that futexes already use. Using the GUP machinery meant that fault_in_safe_writeable() did not do everything that a real fault would do, ranging from not auto-expanding the stack segment, to not updating accessed or dirty flags in the page tables (GUP sets those flags on the pages themselves). The latter causes problems on architectures (like s390) that do accessed bit handling in software, which meant that fault_in_safe_writeable() didn't actually do all the fault handling it needed to, and trying to access the user address afterwards would still cause faults. Reported-and-tested-by: Andreas Gruenbacher Fixes: cdd591fc86e3 ("iov_iter: Introduce fault_in_iov_iter_writeable") Link: https://lore.kernel.org/all/CAHc6FU5nP+nziNGG0JAF1FUx-GV7kKFvM7aZuU_XD2_1v4vnvg@mail.gmail.com/ Acked-by: David Hildenbrand Signed-off-by: Linus Torvalds --- mm/gup.c | 57 +++++++++++++++++++-------------------------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a9d4d724aef7..7bc1ba9ce440 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1729,11 +1729,11 @@ EXPORT_SYMBOL(fault_in_writeable); * @uaddr: start of address range * @size: length of address range * - * Faults in an address range using get_user_pages, i.e., without triggering - * hardware page faults. This is primarily useful when we already know that - * some or all of the pages in the address range aren't in memory. + * Faults in an address range for writing. This is primarily useful when we + * already know that some or all of the pages in the address range aren't in + * memory. * - * Other than fault_in_writeable(), this function is non-destructive. + * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of @@ -1744,46 +1744,27 @@ EXPORT_SYMBOL(fault_in_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)untagged_addr(uaddr); - unsigned long end, nstart, nend; + unsigned long start = (unsigned long)uaddr, end; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - int locked = 0; + bool unlocked = false; - nstart = start & PAGE_MASK; + if (unlikely(size == 0)) + return 0; end = PAGE_ALIGN(start + size); - if (end < nstart) + if (end < start) end = 0; - for (; nstart != end; nstart = nend) { - unsigned long nr_pages; - long ret; - if (!locked) { - locked = 1; - mmap_read_lock(mm); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - nend = end ? min(end, vma->vm_end) : vma->vm_end; - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - nr_pages = (nend - nstart) / PAGE_SIZE; - ret = __get_user_pages_locked(mm, nstart, nr_pages, - NULL, NULL, &locked, - FOLL_TOUCH | FOLL_WRITE); - if (ret <= 0) + mmap_read_lock(mm); + do { + if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) break; - nend = nstart + ret * PAGE_SIZE; - } - if (locked) - mmap_read_unlock(mm); - if (nstart == end) - return 0; - return size - min_t(size_t, nstart - start, size); + start = (start + PAGE_SIZE) & PAGE_MASK; + } while (start != end); + mmap_read_unlock(mm); + + if (size > (unsigned long)uaddr - start) + return size - ((unsigned long)uaddr - start); + return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); -- cgit From 2ac5b58e645c66932438bb021cb5b52097ce70b0 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 10 Mar 2022 01:53:13 +0000 Subject: gianfar: ethtool: Fix refcount leak in gfar_get_ts_info The of_find_compatible_node() function returns a node pointer with refcount incremented, We should use of_node_put() on it when done Add the missing of_node_put() to release the refcount. Fixes: 7349a74ea75c ("net: ethernet: gianfar_ethtool: get phc index through drvdata") Signed-off-by: Miaoqian Lin Reviewed-by: Jesse Brandeburg Reviewed-by: Claudiu Manoil Link: https://lore.kernel.org/r/20220310015313.14938-1-linmq006@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/gianfar_ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index ff756265d58f..9a2c16d69e2c 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -1464,6 +1464,7 @@ static int gfar_get_ts_info(struct net_device *dev, ptp_node = of_find_compatible_node(NULL, NULL, "fsl,etsec-ptp"); if (ptp_node) { ptp_dev = of_find_device_by_node(ptp_node); + of_node_put(ptp_node); if (ptp_dev) ptp = platform_get_drvdata(ptp_dev); } -- cgit From 37c9d66c95564c85a001d8a035354f0220a1e1c3 Mon Sep 17 00:00:00 2001 From: Clément Léger Date: Wed, 9 Mar 2022 15:22:28 +0100 Subject: net: phy: DP83822: clear MISR2 register to disable interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MISR1 was cleared twice but the original author intention was probably to clear MISR1 & MISR2 to completely disable interrupts. Fix it to clear MISR2. Fixes: 87461f7a58ab ("net: phy: DP83822 initial driver submission") Signed-off-by: Clément Léger Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220309142228.761153-1-clement.leger@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 211b5476a6f5..ce17b2af3218 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -274,7 +274,7 @@ static int dp83822_config_intr(struct phy_device *phydev) if (err < 0) return err; - err = phy_write(phydev, MII_DP83822_MISR1, 0); + err = phy_write(phydev, MII_DP83822_MISR2, 0); if (err < 0) return err; -- cgit From 26183cfe478c1d1d5cd1e3920a4b2c5b1980849d Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Tue, 8 Mar 2022 22:25:44 -0800 Subject: net: phy: correct spelling error of media in documentation The header file incorrectly referenced "median-independant interface" instead of media. Correct this typo. Signed-off-by: Colin Foster Fixes: 4069a572d423 ("net: phy: Document core PHY structures") Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220309062544.3073-1-colin.foster@in-advantage.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/phy.h b/include/linux/phy.h index 6de8d7a90d78..8fa70ba063a5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -87,8 +87,8 @@ extern const int phy_10gbit_features_array[1]; * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined - * @PHY_INTERFACE_MODE_MII: Median-independent interface - * @PHY_INTERFACE_MODE_GMII: Gigabit median-independent interface + * @PHY_INTERFACE_MODE_MII: Media-independent interface + * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface -- cgit From 633593a808980f82d251d0ca89730d8bb8b0220c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Mar 2022 16:11:45 -0800 Subject: sctp: fix kernel-infoleak for SCTP sockets syzbot reported a kernel infoleak [1] of 4 bytes. After analysis, it turned out r->idiag_expires is not initialized if inet_sctp_diag_fill() calls inet_diag_msg_common_fill() Make sure to clear idiag_timer/idiag_retrans/idiag_expires and let inet_diag_msg_sctpasoc_fill() fill them again if needed. [1] BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:121 [inline] BUG: KMSAN: kernel-infoleak in copyout lib/iov_iter.c:154 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x6ef/0x25a0 lib/iov_iter.c:668 instrument_copy_to_user include/linux/instrumented.h:121 [inline] copyout lib/iov_iter.c:154 [inline] _copy_to_iter+0x6ef/0x25a0 lib/iov_iter.c:668 copy_to_iter include/linux/uio.h:162 [inline] simple_copy_to_iter+0xf3/0x140 net/core/datagram.c:519 __skb_datagram_iter+0x2d5/0x11b0 net/core/datagram.c:425 skb_copy_datagram_iter+0xdc/0x270 net/core/datagram.c:533 skb_copy_datagram_msg include/linux/skbuff.h:3696 [inline] netlink_recvmsg+0x669/0x1c80 net/netlink/af_netlink.c:1977 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] __sys_recvfrom+0x795/0xa10 net/socket.c:2097 __do_sys_recvfrom net/socket.c:2115 [inline] __se_sys_recvfrom net/socket.c:2111 [inline] __x64_sys_recvfrom+0x19d/0x210 net/socket.c:2111 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:737 [inline] slab_alloc_node mm/slub.c:3247 [inline] __kmalloc_node_track_caller+0xe0c/0x1510 mm/slub.c:4975 kmalloc_reserve net/core/skbuff.c:354 [inline] __alloc_skb+0x545/0xf90 net/core/skbuff.c:426 alloc_skb include/linux/skbuff.h:1158 [inline] netlink_dump+0x3e5/0x16c0 net/netlink/af_netlink.c:2248 __netlink_dump_start+0xcf8/0xe90 net/netlink/af_netlink.c:2373 netlink_dump_start include/linux/netlink.h:254 [inline] inet_diag_handler_cmd+0x2e7/0x400 net/ipv4/inet_diag.c:1341 sock_diag_rcv_msg+0x24a/0x620 netlink_rcv_skb+0x40c/0x7e0 net/netlink/af_netlink.c:2494 sock_diag_rcv+0x63/0x80 net/core/sock_diag.c:277 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x1093/0x1360 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x14d9/0x1720 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] sock_write_iter+0x594/0x690 net/socket.c:1061 do_iter_readv_writev+0xa7f/0xc70 do_iter_write+0x52c/0x1500 fs/read_write.c:851 vfs_writev fs/read_write.c:924 [inline] do_writev+0x645/0xe00 fs/read_write.c:967 __do_sys_writev fs/read_write.c:1040 [inline] __se_sys_writev fs/read_write.c:1037 [inline] __x64_sys_writev+0xe5/0x120 fs/read_write.c:1037 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Bytes 68-71 of 2508 are uninitialized Memory access of size 2508 starts at ffff888114f9b000 Data copied to user address 00007f7fe09ff2e0 CPU: 1 PID: 3478 Comm: syz-executor306 Not tainted 5.17.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Vlad Yasevich Cc: Neil Horman Cc: Marcelo Ricardo Leitner Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20220310001145.297371-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/diag.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 034e2c74497d..d9c6d8f30f09 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -61,10 +61,6 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; r->idiag_retrans = asoc->rtx_data_chunks; r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); - } else { - r->idiag_timer = 0; - r->idiag_retrans = 0; - r->idiag_expires = 0; } } @@ -144,13 +140,14 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); + r->idiag_timer = 0; + r->idiag_retrans = 0; + r->idiag_expires = 0; if (asoc) { inet_diag_msg_sctpasoc_fill(r, sk, asoc); } else { inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; - r->idiag_timer = 0; - r->idiag_retrans = 0; } if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) -- cgit From bc0e610a6eb0d46e4123fafdbe5e6141d9fff3be Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Wed, 9 Mar 2022 20:18:24 +0800 Subject: net: arc_emac: Fix use after free in arc_mdio_probe() If bus->state is equal to MDIOBUS_ALLOCATED, mdiobus_free(bus) will free the "bus". But bus->name is still used in the next line, which will lead to a use after free. We can fix it by putting the name in a local variable and make the bus->name point to the rodata section "name",then use the name in the error message without referring to bus to avoid the uaf. Fixes: 95b5fc03c189 ("net: arc_emac: Make use of the helper function dev_err_probe()") Signed-off-by: Jianglei Nie Link: https://lore.kernel.org/r/20220309121824.36529-1-niejianglei2021@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/arc/emac_mdio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/arc/emac_mdio.c b/drivers/net/ethernet/arc/emac_mdio.c index 9acf589b1178..87f40c2ba904 100644 --- a/drivers/net/ethernet/arc/emac_mdio.c +++ b/drivers/net/ethernet/arc/emac_mdio.c @@ -132,6 +132,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv) { struct arc_emac_mdio_bus_data *data = &priv->bus_data; struct device_node *np = priv->dev->of_node; + const char *name = "Synopsys MII Bus"; struct mii_bus *bus; int error; @@ -142,7 +143,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv) priv->bus = bus; bus->priv = priv; bus->parent = priv->dev; - bus->name = "Synopsys MII Bus"; + bus->name = name; bus->read = &arc_mdio_read; bus->write = &arc_mdio_write; bus->reset = &arc_mdio_reset; @@ -167,7 +168,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv) if (error) { mdiobus_free(bus); return dev_err_probe(priv->dev, error, - "cannot register MDIO bus %s\n", bus->name); + "cannot register MDIO bus %s\n", name); } return 0; -- cgit From 00b022f8f876a3a036b0df7f971001bef6398605 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 9 Mar 2022 22:55:35 -0600 Subject: net: bcmgenet: Don't claim WOL when its not available Some of the bcmgenet platforms don't correctly support WOL, yet ethtool returns: "Supports Wake-on: gsf" which is false. Ideally if there isn't a wol_irq, or there is something else that keeps the device from being able to wakeup it should display: "Supports Wake-on: d" This patch checks whether the device can wakup, before using the hard-coded supported flags. This corrects the ethtool reporting, as well as the WOL configuration because ethtool verifies that the mode is supported before attempting it. Fixes: c51de7f3976b ("net: bcmgenet: add Wake-on-LAN support code") Signed-off-by: Jeremy Linton Tested-by: Peter Robinson Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20220310045535.224450-1-jeremy.linton@arm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index e31a5a397f11..f55d9d9c01a8 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -40,6 +40,13 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) { struct bcmgenet_priv *priv = netdev_priv(dev); + struct device *kdev = &priv->pdev->dev; + + if (!device_can_wakeup(kdev)) { + wol->supported = 0; + wol->wolopts = 0; + return; + } wol->supported = WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER; wol->wolopts = priv->wolopts; -- cgit From 2c87c6f9fbddc5b84d67b2fa3f432fcac6d99d93 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Mar 2022 22:04:47 +0100 Subject: net: phy: meson-gxl: improve link-up behavior Sometimes the link comes up but no data flows. This patch fixes this behavior. It's not clear what's the root cause of the issue. According to the tests one other link-up issue remains. In very rare cases the link isn't even reported as up. Fixes: 84c8f773d2dc ("net: phy: meson-gxl: remove the use of .ack_callback()") Tested-by: Erico Nunes Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/e3473452-a1f9-efcf-5fdd-02b6f44c3fcd@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/meson-gxl.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index c49062ad72c6..73f7962a37d3 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -243,7 +243,13 @@ static irqreturn_t meson_gxl_handle_interrupt(struct phy_device *phydev) irq_status == INTSRC_ENERGY_DETECT) return IRQ_HANDLED; - phy_trigger_machine(phydev); + /* Give PHY some time before MAC starts sending data. This works + * around an issue where network doesn't come up properly. + */ + if (!(irq_status & INTSRC_LINK_DOWN)) + phy_queue_state_machine(phydev, msecs_to_jiffies(100)); + else + phy_trigger_machine(phydev); return IRQ_HANDLED; } -- cgit From 5cb1ebdbc4342b1c2ce89516e19808d64417bdbc Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 10 Mar 2022 18:16:41 +0100 Subject: ice: Fix race condition during interface enslave Commit 5dbbbd01cbba83 ("ice: Avoid RTNL lock when re-creating auxiliary device") changes a process of re-creation of aux device so ice_plug_aux_dev() is called from ice_service_task() context. This unfortunately opens a race window that can result in dead-lock when interface has left LAG and immediately enters LAG again. Reproducer: ``` #!/bin/sh ip link add lag0 type bond mode 1 miimon 100 ip link set lag0 for n in {1..10}; do echo Cycle: $n ip link set ens7f0 master lag0 sleep 1 ip link set ens7f0 nomaster done ``` This results in: [20976.208697] Workqueue: ice ice_service_task [ice] [20976.213422] Call Trace: [20976.215871] __schedule+0x2d1/0x830 [20976.219364] schedule+0x35/0xa0 [20976.222510] schedule_preempt_disabled+0xa/0x10 [20976.227043] __mutex_lock.isra.7+0x310/0x420 [20976.235071] enum_all_gids_of_dev_cb+0x1c/0x100 [ib_core] [20976.251215] ib_enum_roce_netdev+0xa4/0xe0 [ib_core] [20976.256192] ib_cache_setup_one+0x33/0xa0 [ib_core] [20976.261079] ib_register_device+0x40d/0x580 [ib_core] [20976.266139] irdma_ib_register_device+0x129/0x250 [irdma] [20976.281409] irdma_probe+0x2c1/0x360 [irdma] [20976.285691] auxiliary_bus_probe+0x45/0x70 [20976.289790] really_probe+0x1f2/0x480 [20976.298509] driver_probe_device+0x49/0xc0 [20976.302609] bus_for_each_drv+0x79/0xc0 [20976.306448] __device_attach+0xdc/0x160 [20976.310286] bus_probe_device+0x9d/0xb0 [20976.314128] device_add+0x43c/0x890 [20976.321287] __auxiliary_device_add+0x43/0x60 [20976.325644] ice_plug_aux_dev+0xb2/0x100 [ice] [20976.330109] ice_service_task+0xd0c/0xed0 [ice] [20976.342591] process_one_work+0x1a7/0x360 [20976.350536] worker_thread+0x30/0x390 [20976.358128] kthread+0x10a/0x120 [20976.365547] ret_from_fork+0x1f/0x40 ... [20976.438030] task:ip state:D stack: 0 pid:213658 ppid:213627 flags:0x00004084 [20976.446469] Call Trace: [20976.448921] __schedule+0x2d1/0x830 [20976.452414] schedule+0x35/0xa0 [20976.455559] schedule_preempt_disabled+0xa/0x10 [20976.460090] __mutex_lock.isra.7+0x310/0x420 [20976.464364] device_del+0x36/0x3c0 [20976.467772] ice_unplug_aux_dev+0x1a/0x40 [ice] [20976.472313] ice_lag_event_handler+0x2a2/0x520 [ice] [20976.477288] notifier_call_chain+0x47/0x70 [20976.481386] __netdev_upper_dev_link+0x18b/0x280 [20976.489845] bond_enslave+0xe05/0x1790 [bonding] [20976.494475] do_setlink+0x336/0xf50 [20976.502517] __rtnl_newlink+0x529/0x8b0 [20976.543441] rtnl_newlink+0x43/0x60 [20976.546934] rtnetlink_rcv_msg+0x2b1/0x360 [20976.559238] netlink_rcv_skb+0x4c/0x120 [20976.563079] netlink_unicast+0x196/0x230 [20976.567005] netlink_sendmsg+0x204/0x3d0 [20976.570930] sock_sendmsg+0x4c/0x50 [20976.574423] ____sys_sendmsg+0x1eb/0x250 [20976.586807] ___sys_sendmsg+0x7c/0xc0 [20976.606353] __sys_sendmsg+0x57/0xa0 [20976.609930] do_syscall_64+0x5b/0x1a0 [20976.613598] entry_SYSCALL_64_after_hwframe+0x65/0xca 1. Command 'ip link ... set nomaster' causes that ice_plug_aux_dev() is called from ice_service_task() context, aux device is created and associated device->lock is taken. 2. Command 'ip link ... set master...' calls ice's notifier under RTNL lock and that notifier calls ice_unplug_aux_dev(). That function tries to take aux device->lock but this is already taken by ice_plug_aux_dev() in step 1 3. Later ice_plug_aux_dev() tries to take RTNL lock but this is already taken in step 2 4. Dead-lock The patch fixes this issue by following changes: - Bit ICE_FLAG_PLUG_AUX_DEV is kept to be set during ice_plug_aux_dev() call in ice_service_task() - The bit is checked in ice_clear_rdma_cap() and only if it is not set then ice_unplug_aux_dev() is called. If it is set (in other words plugging of aux device was requested and ice_plug_aux_dev() is potentially running) then the function only clears the bit - Once ice_plug_aux_dev() call (in ice_service_task) is finished the bit ICE_FLAG_PLUG_AUX_DEV is cleared but it is also checked whether it was already cleared by ice_clear_rdma_cap(). If so then aux device is unplugged. Signed-off-by: Ivan Vecera Co-developed-by: Petr Oros Signed-off-by: Petr Oros Reviewed-by: Dave Ertman Link: https://lore.kernel.org/r/20220310171641.3863659-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice.h | 11 ++++++++++- drivers/net/ethernet/intel/ice/ice_main.c | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 3121f9b04f59..bea1d1e39fa2 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -898,7 +898,16 @@ static inline void ice_set_rdma_cap(struct ice_pf *pf) */ static inline void ice_clear_rdma_cap(struct ice_pf *pf) { - ice_unplug_aux_dev(pf); + /* We can directly unplug aux device here only if the flag bit + * ICE_FLAG_PLUG_AUX_DEV is not set because ice_unplug_aux_dev() + * could race with ice_plug_aux_dev() called from + * ice_service_task(). In this case we only clear that bit now and + * aux device will be unplugged later once ice_plug_aux_device() + * called from ice_service_task() finishes (see ice_service_task()). + */ + if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + ice_unplug_aux_dev(pf); + clear_bit(ICE_FLAG_RDMA_ENA, pf->flags); clear_bit(ICE_FLAG_AUX_ENA, pf->flags); } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 83e3e8aae6cf..493942e910be 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2255,9 +2255,19 @@ static void ice_service_task(struct work_struct *work) return; } - if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + if (test_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) { + /* Plug aux device per request */ ice_plug_aux_dev(pf); + /* Mark plugging as done but check whether unplug was + * requested during ice_plug_aux_dev() call + * (e.g. from ice_clear_rdma_cap()) and if so then + * plug aux device. + */ + if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + ice_unplug_aux_dev(pf); + } + if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) { struct iidc_event *event; -- cgit From e0ae713023a9d09d6e1b454bdc8e8c1dd32c586e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 9 Mar 2022 23:13:45 +0100 Subject: xdp: xdp_mem_allocator can be NULL in trace_mem_connect(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the commit mentioned below __xdp_reg_mem_model() can return a NULL pointer. This pointer is dereferenced in trace_mem_connect() which leads to segfault. The trace points (mem_connect + mem_disconnect) were put in place to pair connect/disconnect using the IDs. The ID is only assigned if __xdp_reg_mem_model() does not return NULL. That connect trace point is of no use if there is no ID. Skip that connect trace point if xdp_alloc is NULL. [ Toke Høiland-Jørgensen delivered the reasoning for skipping the trace point ] Fixes: 4a48ef70b93b8 ("xdp: Allow registering memory model without rxq reference") Signed-off-by: Sebastian Andrzej Siewior Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/YikmmXsffE+QajTB@linutronix.de Signed-off-by: Jakub Kicinski --- net/core/xdp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 7aba35504986..73fae16264e1 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -357,7 +357,8 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); - trace_mem_connect(xdp_alloc, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) + trace_mem_connect(xdp_alloc, xdp_rxq); return 0; } -- cgit From 0966d385830de3470b7131db8e86c0c5bc9c52dc Mon Sep 17 00:00:00 2001 From: Emil Renner Berthing Date: Wed, 23 Feb 2022 20:12:57 +0100 Subject: riscv: Fix auipc+jalr relocation range checks RISC-V can do PC-relative jumps with a 32bit range using the following two instructions: auipc t0, imm20 ; t0 = PC + imm20 * 2^12 jalr ra, t0, imm12 ; ra = PC + 4, PC = t0 + imm12 Crucially both the 20bit immediate imm20 and the 12bit immediate imm12 are treated as two's-complement signed values. For this reason the immediates are usually calculated like this: imm20 = (offset + 0x800) >> 12 imm12 = offset & 0xfff ..where offset is the signed offset from the auipc instruction. When the 11th bit of offset is 0 the addition of 0x800 doesn't change the top 20 bits and imm12 considered positive. When the 11th bit is 1 the carry of the addition by 0x800 means imm20 is one higher, but since imm12 is then considered negative the two's complement representation means it all cancels out nicely. However, this addition by 0x800 (2^11) means an offset greater than or equal to 2^31 - 2^11 would overflow so imm20 is considered negative and result in a backwards jump. Similarly the lower range of offset is also moved down by 2^11 and hence the true 32bit range is [-2^31 - 2^11, 2^31 - 2^11) Signed-off-by: Emil Renner Berthing Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 68a9e3d1fe16..4a48287513c3 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -13,6 +13,19 @@ #include #include +/* + * The auipc+jalr instruction pair can reach any PC-relative offset + * in the range [-2^31 - 2^11, 2^31 - 2^11) + */ +static bool riscv_insn_valid_32bit_offset(ptrdiff_t val) +{ +#ifdef CONFIG_32BIT + return true; +#else + return (-(1L << 31) - (1L << 11)) <= val && val < ((1L << 31) - (1L << 11)); +#endif +} + static int apply_r_riscv_32_rela(struct module *me, u32 *location, Elf_Addr v) { if (v != (u32)v) { @@ -95,7 +108,7 @@ static int apply_r_riscv_pcrel_hi20_rela(struct module *me, u32 *location, ptrdiff_t offset = (void *)v - (void *)location; s32 hi20; - if (offset != (s32)offset) { + if (!riscv_insn_valid_32bit_offset(offset)) { pr_err( "%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", me->name, (long long)v, location); @@ -197,10 +210,9 @@ static int apply_r_riscv_call_plt_rela(struct module *me, u32 *location, Elf_Addr v) { ptrdiff_t offset = (void *)v - (void *)location; - s32 fill_v = offset; u32 hi20, lo12; - if (offset != fill_v) { + if (!riscv_insn_valid_32bit_offset(offset)) { /* Only emit the plt entry if offset over 32-bit range */ if (IS_ENABLED(CONFIG_MODULE_SECTIONS)) { offset = module_emit_plt_entry(me, v); @@ -224,10 +236,9 @@ static int apply_r_riscv_call_rela(struct module *me, u32 *location, Elf_Addr v) { ptrdiff_t offset = (void *)v - (void *)location; - s32 fill_v = offset; u32 hi20, lo12; - if (offset != fill_v) { + if (!riscv_insn_valid_32bit_offset(offset)) { pr_err( "%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", me->name, (long long)v, location); -- cgit From c993ee0f9f81caf5767a50d1faeba39a0dc82af2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:31 +0000 Subject: watch_queue: Fix filter limit check In watch_queue_set_filter(), there are a couple of places where we check that the filter type value does not exceed what the type_filter bitmap can hold. One place calculates the number of bits by: if (tf[i].type >= sizeof(wfilter->type_filter) * 8) which is fine, but the second does: if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) which is not. This can lead to a couple of out-of-bounds writes due to a too-large type: (1) __set_bit() on wfilter->type_filter (2) Writing more elements in wfilter->filters[] than we allocated. Fix this by just using the proper WATCH_TYPE__NR instead, which is the number of types we actually know about. The bug may cause an oops looking something like: BUG: KASAN: slab-out-of-bounds in watch_queue_set_filter+0x659/0x740 Write of size 4 at addr ffff88800d2c66bc by task watch_queue_oob/611 ... Call Trace: dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x150 ... kasan_report.cold+0x7f/0x11b ... watch_queue_set_filter+0x659/0x740 ... __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Allocated by task 611: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 watch_queue_set_filter+0x23a/0x740 __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88800d2c66a0 which belongs to the cache kmalloc-32 of size 32 The buggy address is located 28 bytes inside of 32-byte region [ffff88800d2c66a0, ffff88800d2c66c0) Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/linux/watch_queue.h | 3 ++- kernel/watch_queue.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index c994d1b2cdba..3b9a40ae8bdb 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -28,7 +28,8 @@ struct watch_type_filter { struct watch_filter { union { struct rcu_head rcu; - unsigned long type_filter[2]; /* Bitmask of accepted types */ + /* Bitmask of accepted types */ + DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c9eb20dd2c5..427b0318e303 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -320,7 +320,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ - if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } @@ -336,7 +336,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { - if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; -- cgit From db8facfc9fafacefe8a835416a6b77c838088f8b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:38 +0000 Subject: watch_queue, pipe: Free watchqueue state after clearing pipe ring In free_pipe_info(), free the watchqueue state after clearing the pipe ring as each pipe ring descriptor has a release function, and in the case of a notification message, this is watch_queue_pipe_buf_release() which tries to mark the allocation bitmap that was previously released. Fix this by moving the put of the pipe's ref on the watch queue to after the ring has been cleared. We still need to call watch_queue_clear() before doing that to make sure that the pipe is disconnected from any notification sources first. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/pipe.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index cc28623a67b6..4eb88bc138bb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -831,10 +831,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) int i; #ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); - put_watch_queue(pipe->watch_queue); - } #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); @@ -844,6 +842,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (buf->ops) pipe_buf_release(pipe, buf); } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + put_watch_queue(pipe->watch_queue); +#endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); -- cgit From c1853fbadcba1497f4907971e7107888e0714c81 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:46 +0000 Subject: watch_queue: Fix to release page in ->release() When a pipe ring descriptor points to a notification message, the refcount on the backing page is incremented by the generic get function, but the release function, which marks the bitmap, doesn't drop the page ref. Fix this by calling generic_pipe_buf_release() at the end of watch_queue_pipe_buf_release(). Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 427b0318e303..dfb3a7e28280 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit += page->index; set_bit(bit, wqueue->notes_bitmap); + generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing -- cgit From 96a4d8912b28451cd62825fd7caa0e66e091d938 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:08 +0000 Subject: watch_queue: Fix to always request a pow-of-2 pipe ring size The pipe ring size must always be a power of 2 as the head and tail pointers are masked off by AND'ing with the size of the ring - 1. watch_queue_set_size(), however, lets you specify any number of notes between 1 and 511. This number is passed through to pipe_resize_ring() without checking/forcing its alignment. Fix this by rounding the number of slots required up to the nearest power of two. The request is meant to guarantee that at least that many notifications can be generated before the queue is full, so rounding down isn't an option, but, alternatively, it may be better to give an error if we aren't allowed to allocate that much ring space. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index dfb3a7e28280..4bcd400984a7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -244,7 +244,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } - ret = pipe_resize_ring(pipe, nr_notes); + ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; -- cgit From a66bd7575b5f449ee0ba20cfd21c3bc5b04ef361 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 11 Mar 2022 13:24:15 +0000 Subject: watch_queue: Use the bitmap API when applicable Use bitmap_alloc() to simplify code, improve the semantic and reduce some open-coded arithmetic in allocator arguments. Also change a memset(0xff) into an equivalent bitmap_fill() to keep consistency. Signed-off-by: Christophe JAILLET Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 4bcd400984a7..5b516eb2c7cc 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -220,7 +220,6 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) struct page **pages; unsigned long *bitmap; unsigned long user_bufs; - unsigned int bmsize; int ret, i, nr_pages; if (!wqueue) @@ -259,13 +258,11 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; } - bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; - bmsize *= sizeof(unsigned long); - bitmap = kmalloc(bmsize, GFP_KERNEL); + bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; - memset(bitmap, 0xff, bmsize); + bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; -- cgit From 3b4c0371928c17af03e8397ac842346624017ce6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:22 +0000 Subject: watch_queue: Fix the alloc bitmap size to reflect notes allocated Currently, watch_queue_set_size() sets the number of notes available in wqueue->nr_notes according to the number of notes allocated, but sets the size of the bitmap to the unrounded number of notes originally asked for. Fix this by setting the bitmap size to the number of notes we're actually going to make available (ie. the number allocated). Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5b516eb2c7cc..9c476d2cbac0 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -243,6 +243,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } + nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; @@ -266,7 +267,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; - wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + wqueue->nr_notes = nr_notes; return 0; error_p: -- cgit From 7ea1a0124b6da246b5bc8c66cddaafd36acf3ecb Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:29 +0000 Subject: watch_queue: Free the alloc bitmap when the watch_queue is torn down Free the watch_queue note allocation bitmap when the watch_queue is destroyed. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c476d2cbac0..c12267ccc70e 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -370,6 +370,7 @@ static void __put_watch_queue(struct kref *kref) for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); + bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) -- cgit From 2ed147f015af2b48f41c6f0b6746aa9ea85c19f3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:36 +0000 Subject: watch_queue: Fix lack of barrier/sync/lock between post and read There's nothing to synchronise post_one_notification() versus pipe_read(). Whilst posting is done under pipe->rd_wait.lock, the reader only takes pipe->mutex which cannot bar notification posting as that may need to be made from contexts that cannot sleep. Fix this by setting pipe->head with a barrier in post_one_notification() and reading pipe->head with a barrier in pipe_read(). If that's not sufficient, the rd_wait.lock will need to be taken, possibly in a ->confirm() op so that it only applies to notifications. The lock would, however, have to be dropped before copy_page_to_iter() is invoked. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/pipe.c | 3 ++- kernel/watch_queue.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 4eb88bc138bb..2667db9506e2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { - unsigned int head = pipe->head; + /* Read ->head with a barrier vs post_one_notification() */ + unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index c12267ccc70e..37bcd900fd77 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -113,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; - pipe->head = head + 1; + smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); -- cgit From 4edc0760412b0c4ecefc7e02cb855b310b122825 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:47 +0000 Subject: watch_queue: Make comment about setting ->defunct more accurate watch_queue_clear() has a comment stating that setting ->defunct to true preventing new additions as well as preventing notifications. Whilst the latter is true, the first bit is superfluous since at the time this function is called, the pipe cannot be accessed to add new event sources. Remove the "new additions" bit from the comment. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 37bcd900fd77..00703444a219 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -566,7 +566,7 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new additions and prevent notifications from happening */ + /* Prevent new notifications from being stored. */ wqueue->defunct = true; while (!hlist_empty(&wqueue->watches)) { -- cgit From a365a65f9ca1ceb9cf1ac29db4a4f51df7c507ad Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Thu, 10 Mar 2022 20:09:15 +0800 Subject: x86/traps: Mark do_int3() NOKPROBE_SYMBOL Since kprobe_int3_handler() is called in do_int3(), probing do_int3() can cause a breakpoint recursion and crash the kernel. Therefore, do_int3() should be marked as NOKPROBE_SYMBOL. Fixes: 21e28290b317 ("x86/traps: Split int3 handler up") Signed-off-by: Li Huafei Signed-off-by: Borislav Petkov Acked-by: Masami Hiramatsu Cc: Link: https://lore.kernel.org/r/20220310120915.63349-1-lihuafei1@huawei.com --- arch/x86/kernel/traps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c9d566dcf89a..8143693a7ea6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -659,6 +659,7 @@ static bool do_int3(struct pt_regs *regs) return res == NOTIFY_STOP; } +NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { -- cgit From 173ce1ca47c489135b2799f70f550e1319ba36d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 15:58:21 +0000 Subject: afs: Fix potential thrashing in afs writeback In afs_writepages_region(), if the dirty page we find is undergoing writeback or write to cache, but the sync_mode is WB_SYNC_NONE, we go round the loop trying the same page again and again with no pausing or waiting unless and until another thread manages to clear the writeback and fscache flags. Fix this with three measures: (1) Advance start to after the page we found. (2) Break out of the loop and return if rescheduling is requested. (3) Arbitrarily give up after a maximum of 5 skips. Fixes: 31143d5d515e ("AFS: implement basic file write support") Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne Acked-by: Marc Dionne Link: https://lore.kernel.org/r/164692725757.2097000.2060513769492301854.stgit@warthog.procyon.org.uk/ # v1 Signed-off-by: Linus Torvalds --- fs/afs/write.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 5e9157d0da29..f447c902318d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -703,7 +703,7 @@ static int afs_writepages_region(struct address_space *mapping, struct folio *folio; struct page *head_page; ssize_t ret; - int n; + int n, skips = 0; _enter("%llx,%llx,", start, end); @@ -754,8 +754,15 @@ static int afs_writepages_region(struct address_space *mapping, #ifdef CONFIG_AFS_FSCACHE folio_wait_fscache(folio); #endif + } else { + start += folio_size(folio); } folio_put(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) + break; + skips++; + } continue; } -- cgit From 413a4a6b0b5553f2423d210f65e98c211b99c3f8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 16:02:18 +0000 Subject: cachefiles: Fix volume coherency attribute A network filesystem may set coherency data on a volume cookie, and if given, cachefiles will store this in an xattr on the directory in the cache corresponding to the volume. The function that sets the xattr just stores the contents of the volume coherency buffer directly into the xattr, with nothing added; the checking function, on the other hand, has a cut'n'paste error whereby it tries to interpret the xattr contents as would be the xattr on an ordinary file (using the cachefiles_xattr struct). This results in a failure to match the coherency data because the buffer ends up being shifted by 18 bytes. Fix this by defining a structure specifically for the volume xattr and making both the setting and checking functions use it. Since the volume coherency doesn't work if used, take the opportunity to insert a reserved field for future use, set it to 0 and check that it is 0. Log mismatch through the appropriate tracepoint. Note that this only affects cifs; 9p, afs, ceph and nfs don't use the volume coherency data at the moment. Fixes: 32e150037dce ("fscache, cachefiles: Store the volume coherency data") Reported-by: Rohith Surabattula Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Steve French cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Signed-off-by: Linus Torvalds --- fs/cachefiles/xattr.c | 23 ++++++++++++++++++++--- include/trace/events/cachefiles.h | 2 ++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 83f41bd0c3a9..35465109d9c4 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -28,6 +28,11 @@ struct cachefiles_xattr { static const char cachefiles_xattr_cache[] = XATTR_USER_PREFIX "CacheFiles.cache"; +struct cachefiles_vol_xattr { + __be32 reserved; /* Reserved, should be 0 */ + __u8 data[]; /* netfs volume coherency data */ +} __packed; + /* * set the state xattr on a cache file */ @@ -185,6 +190,7 @@ void cachefiles_prepare_to_write(struct fscache_cookie *cookie) */ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) { + struct cachefiles_vol_xattr *buf; unsigned int len = volume->vcookie->coherency_len; const void *p = volume->vcookie->coherency; struct dentry *dentry = volume->dentry; @@ -192,10 +198,17 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) _enter("%x,#%d", volume->vcookie->debug_id, len); + len += sizeof(*buf); + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + buf->reserved = cpu_to_be32(0); + memcpy(buf->data, p, len); + ret = cachefiles_inject_write_error(); if (ret == 0) ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - p, len, 0); + buf, len, 0); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, cachefiles_trace_setxattr_error); @@ -209,6 +222,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) cachefiles_coherency_vol_set_ok); } + kfree(buf); _leave(" = %d", ret); return ret == 0; } @@ -218,7 +232,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) */ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) { - struct cachefiles_xattr *buf; + struct cachefiles_vol_xattr *buf; struct dentry *dentry = volume->dentry; unsigned int len = volume->vcookie->coherency_len; const void *p = volume->vcookie->coherency; @@ -228,6 +242,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) _enter(""); + len += sizeof(*buf); buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -245,7 +260,9 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) "Failed to read xattr with error %zd", xlen); } why = cachefiles_coherency_vol_check_xattr; - } else if (memcmp(buf->data, p, len) != 0) { + } else if (buf->reserved != cpu_to_be32(0)) { + why = cachefiles_coherency_vol_check_resv; + } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) { why = cachefiles_coherency_vol_check_cmp; } else { why = cachefiles_coherency_vol_check_ok; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index c6f5aa74db89..2c530637e10a 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -56,6 +56,7 @@ enum cachefiles_coherency_trace { cachefiles_coherency_set_ok, cachefiles_coherency_vol_check_cmp, cachefiles_coherency_vol_check_ok, + cachefiles_coherency_vol_check_resv, cachefiles_coherency_vol_check_xattr, cachefiles_coherency_vol_set_fail, cachefiles_coherency_vol_set_ok, @@ -139,6 +140,7 @@ enum cachefiles_error_trace { EM(cachefiles_coherency_set_ok, "SET ok ") \ EM(cachefiles_coherency_vol_check_cmp, "VOL BAD cmp ") \ EM(cachefiles_coherency_vol_check_ok, "VOL OK ") \ + EM(cachefiles_coherency_vol_check_resv, "VOL BAD resv") \ EM(cachefiles_coherency_vol_check_xattr,"VOL BAD xatt") \ EM(cachefiles_coherency_vol_set_fail, "VOL SET fail") \ E_(cachefiles_coherency_vol_set_ok, "VOL SET ok ") -- cgit From 08999b2489b4c9b939d7483dbd03702ee4576d96 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 4 Mar 2022 00:38:58 +0200 Subject: x86/sgx: Free backing memory after faulting the enclave page There is a limited amount of SGX memory (EPC) on each system. When that memory is used up, SGX has its own swapping mechanism which is similar in concept but totally separate from the core mm/* code. Instead of swapping to disk, SGX swaps from EPC to normal RAM. That normal RAM comes from a shared memory pseudo-file and can itself be swapped by the core mm code. There is a hierarchy like this: EPC <-> shmem <-> disk After data is swapped back in from shmem to EPC, the shmem backing storage needs to be freed. Currently, the backing shmem is not freed. This effectively wastes the shmem while the enclave is running. The memory is recovered when the enclave is destroyed and the backing storage freed. Sort this out by freeing memory with shmem_truncate_range(), as soon as a page is faulted back to the EPC. In addition, free the memory for PCMD pages as soon as all PCMD's in a page have been marked as unused by zeroing its contents. Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Dave Hansen Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220303223859.273187-1-jarkko@kernel.org --- arch/x86/kernel/cpu/sgx/encl.c | 57 +++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 48afe96ae0f0..7c63a1911fae 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,30 @@ #include "encls.h" #include "sgx.h" +/* + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's + * follow right after the EPC data in the backing storage. In addition to the + * visible enclave pages, there's one extra page slot for SECS, before PCMD + * structs. + */ +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl, + unsigned long page_index) +{ + pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs); + + return epc_end_off + page_index * sizeof(struct sgx_pcmd); +} + +/* + * Free a page from the backing storage in the given page index. + */ +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index) +{ + struct inode *inode = file_inode(encl->backing); + + shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1); +} + /* * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC * Pages" in the SDM. @@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, { unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; + pgoff_t page_index, page_pcmd_off; struct sgx_pageinfo pginfo; struct sgx_backing b; - pgoff_t page_index; + bool pcmd_page_empty; + u8 *pcmd_page; int ret; if (secs_page) @@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + ret = sgx_encl_get_backing(encl, page_index, &b); if (ret) return ret; pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + - b.pcmd_offset; + pcmd_page = kmap_atomic(b.pcmd); + pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); @@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = -EFAULT; } - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + + /* + * The area for the PCMD in the page was zeroed above. Check if the + * whole page is now empty meaning that all PCMD's have been zeroed: + */ + pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); + + kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); sgx_encl_put_backing(&b, false); + sgx_encl_truncate_backing_page(encl, page_index); + + if (pcmd_page_empty) + sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + return ret; } @@ -579,7 +620,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); struct page *contents; struct page *pcmd; @@ -587,7 +628,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, if (IS_ERR(contents)) return PTR_ERR(contents); - pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); if (IS_ERR(pcmd)) { put_page(contents); return PTR_ERR(pcmd); @@ -596,9 +637,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; - backing->pcmd_offset = - (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * - sizeof(struct sgx_pcmd); + backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); return 0; } -- cgit From 6c7cb60bff7aec24b834343ff433125f469886a3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Mar 2022 17:13:17 +0000 Subject: ARM: fix Thumb2 regression with Spectre BHB When building for Thumb2, the vectors make use of a local label. Sadly, the Spectre BHB code also uses a local label with the same number which results in the Thumb2 reference pointing at the wrong place. Fix this by changing the number used for the Spectre BHB local label. Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Tested-by: Nathan Chancellor Signed-off-by: Russell King (Oracle) Signed-off-by: Linus Torvalds --- arch/arm/kernel/entry-armv.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 676703cbfe4b..ee3f7a599181 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1040,9 +1040,9 @@ vector_bhb_loop8_\name: @ bhb workaround mov r0, #8 -1: b . + 4 +3: b . + 4 subs r0, r0, #1 - bne 1b + bne 3b dsb isb b 2b -- cgit From 68453767131a5deec1e8f9ac92a9042f929e585d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 11 Mar 2022 11:49:12 -0800 Subject: ARM: Spectre-BHB: provide empty stub for non-config When CONFIG_GENERIC_CPU_VULNERABILITIES is not set, references to spectre_v2_update_state() cause a build error, so provide an empty stub for that function when the Kconfig option is not set. Fixes this build error: arm-linux-gnueabi-ld: arch/arm/mm/proc-v7-bugs.o: in function `cpu_v7_bugs_init': proc-v7-bugs.c:(.text+0x52): undefined reference to `spectre_v2_update_state' arm-linux-gnueabi-ld: proc-v7-bugs.c:(.text+0x82): undefined reference to `spectre_v2_update_state' Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Russell King Cc: Catalin Marinas Cc: linux-arm-kernel@lists.infradead.org Cc: patches@armlinux.org.uk Acked-by: Russell King (Oracle) Signed-off-by: Linus Torvalds --- arch/arm/include/asm/spectre.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h index d1fa5607d3aa..85f9e538fb32 100644 --- a/arch/arm/include/asm/spectre.h +++ b/arch/arm/include/asm/spectre.h @@ -25,7 +25,13 @@ enum { SPECTRE_V2_METHOD_LOOP8 = BIT(__SPECTRE_V2_METHOD_LOOP8), }; +#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES void spectre_v2_update_state(unsigned int state, unsigned int methods); +#else +static inline void spectre_v2_update_state(unsigned int state, + unsigned int methods) +{} +#endif int spectre_bhb_update_vectors(unsigned int method); -- cgit From 3755d35ee1d2454b20b8a1e20d790e56201678a4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 3 Feb 2022 10:39:22 +0100 Subject: drm/panel: Select DRM_DP_HELPER for DRM_PANEL_EDP As reported in [1], DRM_PANEL_EDP depends on DRM_DP_HELPER. Select the option to fix the build failure. The error message is shown below. arm-linux-gnueabihf-ld: drivers/gpu/drm/panel/panel-edp.o: in function `panel_edp_probe': panel-edp.c:(.text+0xb74): undefined reference to `drm_panel_dp_aux_backlight' make[1]: *** [/builds/linux/Makefile:1222: vmlinux] Error 1 The issue has been reported before, when DisplayPort helpers were hidden behind the option CONFIG_DRM_KMS_HELPER. [2] v2: * fix and expand commit description (Arnd) Signed-off-by: Thomas Zimmermann Fixes: 9d6366e743f3 ("drm: fb_helper: improve CONFIG_FB dependency") Reported-by: Naresh Kamboju Reported-by: Linux Kernel Functional Testing Reviewed-by: Lyude Paul Acked-by: Sam Ravnborg Link: https://lore.kernel.org/dri-devel/CA+G9fYvN0NyaVkRQmA1O6rX7H8PPaZrUAD7=RDy33QY9rUU-9g@mail.gmail.com/ # [1] Link: https://lore.kernel.org/all/20211117062704.14671-1-rdunlap@infradead.org/ # [2] Cc: Thomas Zimmermann Cc: Lyude Paul Cc: Daniel Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: dri-devel@lists.freedesktop.org Link: https://patchwork.freedesktop.org/patch/msgid/20220203093922.20754-1-tzimmermann@suse.de Signed-off-by: Dave Airlie --- drivers/gpu/drm/panel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index 434c2861bb40..0aec5a10b064 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -106,6 +106,7 @@ config DRM_PANEL_EDP depends on PM select VIDEOMODE_HELPERS select DRM_DP_AUX_BUS + select DRM_DP_HELPER help DRM panel driver for dumb eDP panels that need at most a regulator and a GPIO to be powered up. Optionally a backlight can be attached so -- cgit From 3ec94eeaff9ad58a76be2232068b4a2546b2f6bb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 12:53:44 -0300 Subject: tools kvm headers arm64: Update KVM headers from the kernel sources To pick the changes from: a5905d6af492ee6a ("KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated") That don't causes any changes in tooling (when built on x86), only addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm64/include/uapi/asm/kvm.h' diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: James Morse Link: https://lore.kernel.org/lkml/YiyhAK6sVPc83FaI@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/kvm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index b3edde68bc3e..323e251ed37b 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -281,6 +281,11 @@ struct kvm_arm_copy_mte_tags { #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 KVM_REG_ARM_FW_REG(3) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED 2 + /* SVE registers */ #define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT) -- cgit From ec9d50ace39925f7fd0302bf0fad640e2c9826ea Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: tools headers cpufeatures: Sync with the kernel sources To pick the changes from: d45476d983240937 ("x86/speculation: Rename RETPOLINE_AMD to RETPOLINE_LFENCE") Its just a comment fixup. This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Borislav Petkov Cc: Peter Zijlstra (Intel) Link: https://lore.kernel.org/lkml/YiyiHatGaJQM7l/Y@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index ab4e53715d8a..65d147974f8d 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -204,7 +204,7 @@ /* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCEs for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -- cgit From a7a72631f62445e3671b7cab5ad01f856c1aa90d Mon Sep 17 00:00:00 2001 From: Weiguo Li Date: Fri, 11 Mar 2022 21:06:57 +0800 Subject: perf parse-events: Fix NULL check against wrong variable We did a null check after "tmp->symbol = strdup(...)", but we checked "list->symbol" other than "tmp->symbol". Reviewed-by: John Garry Signed-off-by: Weiguo Li Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/tencent_DF39269807EC9425E24787E6DB632441A405@qq.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9739b05b999e..dfb50a5f83d0 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2193,7 +2193,7 @@ int perf_pmu__test_parse_init(void) for (i = 0; i < ARRAY_SIZE(symbols); i++, tmp++) { tmp->type = symbols[i].type; tmp->symbol = strdup(symbols[i].symbol); - if (!list->symbol) + if (!tmp->symbol) goto err_free; } -- cgit From 073a15c3512f6b8d36c0c05992cf31e845f4dfe0 Mon Sep 17 00:00:00 2001 From: Weiguo Li Date: Fri, 11 Mar 2022 21:07:16 +0800 Subject: perf bench: Fix NULL check against wrong variable We did a NULL check after "epollfdp = calloc(...)", but we checked "epollfd" instead of "epollfdp". Signed-off-by: Weiguo Li Acked-by: Davidlohr Bueso Cc: Alexander Shishkin Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/tencent_B5D64530EB9C7DBB8D2C88A0C790F1489D0A@qq.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/epoll-ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 740ae764537e..134612bde0cb 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -106,7 +106,7 @@ static void nest_epollfd(void) printinfo("Nesting level(s): %d\n", nested); epollfdp = calloc(nested, sizeof(int)); - if (!epollfd) + if (!epollfdp) err(EXIT_FAILURE, "calloc"); for (i = 0; i < nested; i++) { -- cgit From 91c9923a473a694eb1c5c01ab778a77114969707 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Mon, 7 Mar 2022 23:16:27 +0800 Subject: perf parse: Fix event parser error for hybrid systems This bug happened on hybrid systems when both cpu_core and cpu_atom have the same event name such as "UOPS_RETIRED.MS" while their event terms are different, then during perf stat, the event for cpu_atom will parse fail and then no output for cpu_atom. UOPS_RETIRED.MS -> cpu_core/period=0x1e8483,umask=0x4,event=0xc2,frontend=0x8/ UOPS_RETIRED.MS -> cpu_atom/period=0x1e8483,umask=0x1,event=0xc2/ It is because event terms in the "head" of parse_events_multi_pmu_add will be changed to event terms for cpu_core after parsing UOPS_RETIRED.MS for cpu_core, then when parsing the same event for cpu_atom, it still uses the event terms for cpu_core, but event terms for cpu_atom are different with cpu_core, the event parses for cpu_atom will fail. This patch fixes it, the event terms should be parsed from the original event. This patch can work for the hybrid systems that have the same event in more than 2 PMUs. It also can work in non-hybrid systems. Before: # perf stat -v -e UOPS_RETIRED.MS -a sleep 1 Using CPUID GenuineIntel-6-97-1 UOPS_RETIRED.MS -> cpu_core/period=0x1e8483,umask=0x4,event=0xc2,frontend=0x8/ Control descriptor is not initialized UOPS_RETIRED.MS: 2737845 16068518485 16068518485 Performance counter stats for 'system wide': 2,737,845 cpu_core/UOPS_RETIRED.MS/ 1.002553850 seconds time elapsed After: # perf stat -v -e UOPS_RETIRED.MS -a sleep 1 Using CPUID GenuineIntel-6-97-1 UOPS_RETIRED.MS -> cpu_core/period=0x1e8483,umask=0x4,event=0xc2,frontend=0x8/ UOPS_RETIRED.MS -> cpu_atom/period=0x1e8483,umask=0x1,event=0xc2/ Control descriptor is not initialized UOPS_RETIRED.MS: 1977555 16076950711 16076950711 UOPS_RETIRED.MS: 568684 8038694234 8038694234 Performance counter stats for 'system wide': 1,977,555 cpu_core/UOPS_RETIRED.MS/ 568,684 cpu_atom/UOPS_RETIRED.MS/ 1.004758259 seconds time elapsed Fixes: fb0811535e92c6c1 ("perf parse-events: Allow config on kernel PMU events") Reviewed-by: Kan Liang Signed-off-by: Zhengjun Xing Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220307151627.30049-1-zhengjun.xing@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index dfb50a5f83d0..24997925ae00 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1648,6 +1648,7 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, { struct parse_events_term *term; struct list_head *list = NULL; + struct list_head *orig_head = NULL; struct perf_pmu *pmu = NULL; int ok = 0; char *config; @@ -1674,7 +1675,6 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } list_add_tail(&term->list, head); - /* Add it for all PMUs that support the alias */ list = malloc(sizeof(struct list_head)); if (!list) @@ -1687,13 +1687,15 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, list_for_each_entry(alias, &pmu->aliases, list) { if (!strcasecmp(alias->name, str)) { + parse_events_copy_term_list(head, &orig_head); if (!parse_events_add_pmu(parse_state, list, - pmu->name, head, + pmu->name, orig_head, true, true)) { pr_debug("%s -> %s/%s/\n", str, pmu->name, alias->str); ok++; } + parse_events_terms__delete(orig_head); } } } -- cgit From 09688c0166e76ce2fb85e86b9d99be8b0084cdf9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Mar 2022 13:23:37 -0700 Subject: Linux 5.17-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 87f672473523..55a30ca69350 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Superb Owl # *DOCUMENTATION* -- cgit