From 3af75db12e3d3537bc01c06bbdc876cb17e82b35 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 24 Aug 2017 09:22:57 +0900 Subject: iio: adc: ti-ads1015: fix comparator polarity setting The comparator polarity field in config register is not correctly initialized as per the interrupt trigger setting. Because the bitfield definision is wrong and bit shifting is missed. Fixes: d9f39babd8ba ("iio: adc: ti-ads1015: add threshold event support") Cc: Daniel Baluta Cc: Jonathan Cameron Signed-off-by: Akinobu Mita Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads1015.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/iio/adc/ti-ads1015.c b/drivers/iio/adc/ti-ads1015.c index d1210024f6bc..e0dc20488335 100644 --- a/drivers/iio/adc/ti-ads1015.c +++ b/drivers/iio/adc/ti-ads1015.c @@ -52,7 +52,7 @@ #define ADS1015_CFG_COMP_QUE_MASK GENMASK(1, 0) #define ADS1015_CFG_COMP_LAT_MASK BIT(2) -#define ADS1015_CFG_COMP_POL_MASK BIT(2) +#define ADS1015_CFG_COMP_POL_MASK BIT(3) #define ADS1015_CFG_COMP_MODE_MASK BIT(4) #define ADS1015_CFG_DR_MASK GENMASK(7, 5) #define ADS1015_CFG_MOD_MASK BIT(8) @@ -1017,10 +1017,12 @@ static int ads1015_probe(struct i2c_client *client, switch (irq_trig) { case IRQF_TRIGGER_LOW: - cfg_comp |= ADS1015_CFG_COMP_POL_LOW; + cfg_comp |= ADS1015_CFG_COMP_POL_LOW << + ADS1015_CFG_COMP_POL_SHIFT; break; case IRQF_TRIGGER_HIGH: - cfg_comp |= ADS1015_CFG_COMP_POL_HIGH; + cfg_comp |= ADS1015_CFG_COMP_POL_HIGH << + ADS1015_CFG_COMP_POL_SHIFT; break; default: return -EINVAL; -- cgit From c65e3d6ef4bfdc4c8460509f08507cf7dc026974 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 29 Aug 2017 13:45:11 +0200 Subject: iio: magnetometer: st_magn: fix drdy line configuration for LIS3MDL Data-ready line in LIS3MDL is routed to drdy pin and it is not possible to select a different INT pin. st_sensors_set_dataready_irq() assumes that if drdy int address is not exported in register map, irq trigger is not supported by the sensor and hw_irq_trigger is always false. Based on this configuration st_sensors_irq_thread does not consume generated interrupt causing an unhandled irq. Fix this taking into account status register address in st_sensors_set_dataready_irq() Fixes: 90efe0556292 (iio: st_sensors: harden interrupt handling) Signed-off-by: Lorenzo Bianconi Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/common/st_sensors/st_sensors_core.c | 11 ++++++++++- drivers/iio/magnetometer/st_magn_core.c | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index d99bb1460fe2..02e833b14db0 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -463,8 +463,17 @@ int st_sensors_set_dataready_irq(struct iio_dev *indio_dev, bool enable) u8 drdy_mask; struct st_sensor_data *sdata = iio_priv(indio_dev); - if (!sdata->sensor_settings->drdy_irq.addr) + if (!sdata->sensor_settings->drdy_irq.addr) { + /* + * there are some devices (e.g. LIS3MDL) where drdy line is + * routed to a given pin and it is not possible to select a + * different one. Take into account irq status register + * to understand if irq trigger can be properly supported + */ + if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) + sdata->hw_irq_trigger = enable; return 0; + } /* Enable/Disable the interrupt generator 1. */ if (sdata->sensor_settings->drdy_irq.ig1.en_addr > 0) { diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index 703e77008652..2e36d746f5bc 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -315,6 +315,10 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { }, }, }, + .drdy_irq = { + /* drdy line is routed drdy pin */ + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + }, .multi_read_bit = true, .bootime = 2, }, -- cgit From 50662499f9112ecced68d064846a2f1fd9640b66 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:18 +0200 Subject: ARM64: dts: meson-gx: Use correct mmc clock source 0 Now that the clock source 0 is properly described in the CCF, use it instead of assuming the default value (xtal) Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi | 6 +++--- arch/arm64/boot/dts/amlogic/meson-gxl.dtsi | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi index 52f1687e7a09..8f0c0cb02157 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi @@ -661,21 +661,21 @@ &sd_emmc_a { clocks = <&clkc CLKID_SD_EMMC_A>, - <&xtal>, + <&clkc CLKID_SD_EMMC_A_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_b { clocks = <&clkc CLKID_SD_EMMC_B>, - <&xtal>, + <&clkc CLKID_SD_EMMC_B_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_c { clocks = <&clkc CLKID_SD_EMMC_C>, - <&xtal>, + <&clkc CLKID_SD_EMMC_C_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi index d6876e64979e..829d84db5fc5 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi @@ -603,21 +603,21 @@ &sd_emmc_a { clocks = <&clkc CLKID_SD_EMMC_A>, - <&xtal>, + <&clkc CLKID_SD_EMMC_A_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_b { clocks = <&clkc CLKID_SD_EMMC_B>, - <&xtal>, + <&clkc CLKID_SD_EMMC_B_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_c { clocks = <&clkc CLKID_SD_EMMC_C>, - <&xtal>, + <&clkc CLKID_SD_EMMC_C_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; -- cgit From 673ccaaccf32a044d961c3bac3dd63452bdfa86c Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:19 +0200 Subject: ARM64: dts: meson: remove cap-sd-highspeed from emmc nodes It does not make much sense to define cap-sd-highspeed in the emmc nodes Just remove it. Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts | 1 - 11 files changed, 11 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi index c89010e56488..d4f9c5b550c7 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi @@ -215,7 +215,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 9697a7a79464..7dae6acd3c8c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -297,7 +297,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; max-frequency = <200000000>; non-removable; disable-wp; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts index 9c59c3c6d1b6..a690956d6c75 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts @@ -274,7 +274,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts index d147c853ab05..a12303becab4 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts @@ -274,7 +274,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; max-frequency = <200000000>; non-removable; disable-wp; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi index 81ffc689a5bf..d77e19591ee3 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi @@ -241,7 +241,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi index 346753fb6324..0262ef8d48e4 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi @@ -201,7 +201,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts index 2a5804ce7f4b..f779a985f923 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts @@ -144,7 +144,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <100000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts index 4c2ac7650fcd..21274a6c1b9b 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts @@ -231,7 +231,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi index f3eea8e89d12..8899121f79e1 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi @@ -137,7 +137,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts index 9b10c5f4f8c0..ff8a9f780485 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts @@ -196,7 +196,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts index 08f1dd69b679..470f72bb863c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts @@ -220,7 +220,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; -- cgit From 67e7607fcdf1fad10e9f183424e709c59713e45d Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:20 +0200 Subject: ARM64: dts: meson: add mmc clk gate pins Add the pinctrl to switch mmc clk pins in gpio (pulled down) mode. This is necessary to be able to gate the clk outside of the SoC while keeping it running in the controller Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- .../arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi | 9 ++++-- .../boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 9 ++++-- .../boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts | 11 +++++--- .../arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts | 8 ++++-- arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi | 9 ++++-- .../boot/dts/amlogic/meson-gxbb-vega-s95.dtsi | 9 ++++-- arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi | 33 ++++++++++++++++++++++ .../dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts | 6 ++-- .../dts/amlogic/meson-gxl-s905x-libretech-cc.dts | 6 ++-- .../dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts | 9 ++++-- .../boot/dts/amlogic/meson-gxl-s905x-p212.dtsi | 9 ++++-- arch/arm64/boot/dts/amlogic/meson-gxl.dtsi | 33 ++++++++++++++++++++++ .../arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts | 6 ++-- 13 files changed, 126 insertions(+), 31 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi index d4f9c5b550c7..4157987f4a3d 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi @@ -168,7 +168,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -194,7 +195,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -212,7 +214,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 7dae6acd3c8c..60d5f2da6916 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -250,7 +250,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>, <&sdio_irq_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -276,7 +277,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -294,7 +296,8 @@ &sd_emmc_c { status = "disabled"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; max-frequency = <200000000>; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts index a690956d6c75..38dfdde5c147 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts @@ -51,7 +51,7 @@ / { compatible = "nexbox,a95x", "amlogic,meson-gxbb"; model = "NEXBOX A95X"; - + aliases { serial0 = &uart_AO; }; @@ -232,7 +232,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -253,7 +254,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -271,7 +273,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts index a12303becab4..1ffa1c238a72 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts @@ -50,7 +50,7 @@ / { compatible = "hardkernel,odroid-c2", "amlogic,meson-gxbb"; model = "Hardkernel ODROID-C2"; - + aliases { serial0 = &uart_AO; }; @@ -253,7 +253,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -271,7 +272,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; max-frequency = <200000000>; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi index d77e19591ee3..704b214e8894 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi @@ -194,7 +194,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -220,7 +221,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -238,7 +240,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi index 0262ef8d48e4..f2bc6dea1fc6 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi @@ -155,7 +155,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins &sdio_irq_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -181,7 +182,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -198,7 +200,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi index 8f0c0cb02157..af834cdbba79 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi @@ -392,6 +392,17 @@ }; }; + emmc_clk_gate_pins: emmc_clk_gate { + mux { + groups = "BOOT_8"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "BOOT_8"; + bias-pull-down; + }; + }; + nor_pins: nor { mux { groups = "nor_d", @@ -430,6 +441,17 @@ }; }; + sdcard_clk_gate_pins: sdcard_clk_gate { + mux { + groups = "CARD_2"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "CARD_2"; + bias-pull-down; + }; + }; + sdio_pins: sdio { mux { groups = "sdio_d0", @@ -442,6 +464,17 @@ }; }; + sdio_clk_gate_pins: sdio_clk_gate { + mux { + groups = "GPIOX_4"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "GPIOX_4"; + bias-pull-down; + }; + }; + sdio_irq_pins: sdio_irq { mux { groups = "sdio_irq"; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts index f779a985f923..977b4240f3c1 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts @@ -123,7 +123,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -141,7 +142,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts index 69ca14ac10fa..a014c052241e 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts @@ -197,7 +197,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -215,7 +216,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts index 21274a6c1b9b..1b8f32867aa1 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts @@ -189,7 +189,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -210,7 +211,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -228,7 +230,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi index 8899121f79e1..129af9068814 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi @@ -95,7 +95,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -116,7 +117,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -134,7 +136,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi index 829d84db5fc5..d8dd3298b15c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi @@ -281,6 +281,17 @@ }; }; + emmc_clk_gate_pins: emmc_clk_gate { + mux { + groups = "BOOT_8"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "BOOT_8"; + bias-pull-down; + }; + }; + nor_pins: nor { mux { groups = "nor_d", @@ -319,6 +330,17 @@ }; }; + sdcard_clk_gate_pins: sdcard_clk_gate { + mux { + groups = "CARD_2"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "CARD_2"; + bias-pull-down; + }; + }; + sdio_pins: sdio { mux { groups = "sdio_d0", @@ -331,6 +353,17 @@ }; }; + sdio_clk_gate_pins: sdio_clk_gate { + mux { + groups = "GPIOX_4"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "GPIOX_4"; + bias-pull-down; + }; + }; + sdio_irq_pins: sdio_irq { mux { groups = "sdio_irq"; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts index ff8a9f780485..22c697732f66 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts @@ -175,7 +175,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -193,7 +194,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; -- cgit From 42776561a1def5d96699574efd7c9cbbd2e0fbc4 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:21 +0200 Subject: ARM64: dts: meson-gxbb: nanopi-k2: add card regulator settle times Changing the card voltage on the nanopi-k2 is not instantaneous, especially when switching from 3.3v to 1.8v. It take at least 3ms for the regulator to go from 3.3v to 1.8v. Add margin to that to make sure we don't upset the sdcard during the voltage switch Fixes: 9bc7ffb08daf ("arm64: dts: amlogic: Add NanoPi K2") Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 60d5f2da6916..acb6797756e5 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -107,6 +107,9 @@ states = <3300000 0>, <1800000 1>; + + regulator-settling-time-up-us = <100>; + regulator-settling-time-down-us = <5000>; }; wifi_32k: wifi-32k { -- cgit From 8a5085c420d272af04552b2d2213471247fa86f2 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:22 +0200 Subject: ARM64: dts: meson-gxl: libretech-cc: add card regulator settle times Changing the card voltage on the cc is not instantaneous, especially when switching from 3.3v to 1.8v. It take at least 30ms for the regulator to go from 3.3v to 1.8v. Add margin to that to make sure we don't upset the sdcard during the voltage switch Fixes: 61ff2af9b278 ("ARM64: dts: fixup libretech cc definition") Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts index a014c052241e..7d252168c2fa 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts @@ -91,6 +91,9 @@ states = <3300000 0>, <1800000 1>; + + regulator-settling-time-up-us = <200>; + regulator-settling-time-down-us = <50000>; }; vddio_boot: regulator-vddio_boot { -- cgit From 3cde63ebc85cea63806d86a690d04457c0347703 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:23 +0200 Subject: ARM64: dts: meson-gxl: libretech-cc: enable high speed modes Enable sdcard UHS modes up to SDR50. Unfortunately, it seems the PCB of the libretech-cc cannot handle SDR104 at 200Mhz reliably. Also enable eMMC DDR52 mode. Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts index 7d252168c2fa..64c54c92e214 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts @@ -205,6 +205,9 @@ bus-width = <4>; cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; max-frequency = <100000000>; disable-wp; @@ -224,6 +227,7 @@ bus-width = <8>; cap-mmc-highspeed; + mmc-ddr-3_3v; max-frequency = <50000000>; non-removable; disable-wp; -- cgit From 0f553358241a3346b7eef133d631e5bc2f067a15 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:24 +0200 Subject: ARM64: dts: meson-gxbb: p20x: enable sdcard UHS modes Enable sdcard UHS modes, up to SDR50, on p20x based boards. While the s905 supports SDR104 mode, it appears that the PCB of p20x based boards can't cope with a rate as high as 200Mhz. Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi index 704b214e8894..23c08c3afd0a 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi @@ -226,6 +226,9 @@ bus-width = <4>; cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; max-frequency = <100000000>; disable-wp; -- cgit From c1429e20a5a9f578e0e3ddb551c8ea94e8d3ddb3 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:25 +0200 Subject: ARM64: dts: meson-gxbb: nanopi-k2: enable sdcard UHS modes Enable UHS modes, up to SDR50, on the nanopi-k2 SBC. Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index acb6797756e5..4c1320a93fef 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -285,6 +285,9 @@ bus-width = <4>; cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; max-frequency = <100000000>; disable-wp; -- cgit From 485a308f05d843034b6e82f688704c44888aecde Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:26 +0200 Subject: ARM64: dts: meson-gxbb: nanopi-k2: enable sdr104 mode SDR104 seems to be OK on the nanopi-k2 SBC so enable it Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 4c1320a93fef..4b17a76959b2 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -288,7 +288,8 @@ sd-uhs-sdr12; sd-uhs-sdr25; sd-uhs-sdr50; - max-frequency = <100000000>; + sd-uhs-sdr104; + max-frequency = <200000000>; disable-wp; cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>; -- cgit From ce06760ba46b66dae50f2519ae76bd15e89b5710 Mon Sep 17 00:00:00 2001 From: Ping Cheng Date: Thu, 31 Aug 2017 15:50:03 -0700 Subject: HID: wacom: bits shifted too much for 9th and 10th buttons Cintiq 12 has 10 expresskey buttons. The bit shift for the last two buttons were off by 5. Fixes: c7f0522 ("HID: wacom: Slim down wacom_intuos_pad processing") Signed-off-by: Ping Cheng Tested-by: Matthieu Robin Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index bb17d7bbefd3..f8ddcaaa2ac6 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -567,8 +567,8 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) keys = data[9] & 0x07; } } else { - buttons = ((data[6] & 0x10) << 10) | - ((data[5] & 0x10) << 9) | + buttons = ((data[6] & 0x10) << 5) | + ((data[5] & 0x10) << 4) | ((data[6] & 0x0F) << 4) | (data[5] & 0x0F); } -- cgit From 74aebed6dc13425233f2224668353cff7a112776 Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Mon, 28 Aug 2017 14:15:39 -0700 Subject: HID: wacom: leds: Don't try to control the EKR's read-only LEDs Commit a50aac7193f1 introduces 'led.groups' and adds EKR support for these groups. However, unlike the other devices with LEDs, the EKR's LEDs are read-only and we shouldn't attempt to control them in wacom_led_control(). See bug: https://sourceforge.net/p/linuxwacom/bugs/342/ Fixes: a50aac7193f1 ("HID: wacom: leds: dynamically allocate LED groups") Cc: stable # 4.9 Signed-off-by: Aaron Armstrong Skomra Reviewed-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index e82a696a1d07..735bfbbcaa82 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -766,6 +766,9 @@ static int wacom_led_control(struct wacom *wacom) if (!wacom->led.groups) return -ENOTSUPP; + if (wacom->wacom_wac.features.type == REMOTE) + return -ENOTSUPP; + if (wacom->wacom_wac.pid) { /* wireless connected */ report_id = WAC_CMD_WL_LED_CONTROL; buf_size = 13; -- cgit From a3ae552b5259e6ffd8d124a9fa9923283af2828a Mon Sep 17 00:00:00 2001 From: Nicholas Bishop Date: Mon, 4 Sep 2017 15:40:42 -0400 Subject: HID: add multi-input quirk for IDC6680 touchscreen The Ideacom 6680 touchscreen is found in the Dell Latitude 2100. It has two USB descriptors, the first of which has two input reports. The HID_QUIRK_MULTI_INPUT quirk is needed to keep the correct maximum value for ABS_X/ABS_Y (8191 instead of 65535). Signed-off-by: Nicholas Bishop Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/usbhid/hid-quirks.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index b397a14ab970..11798b10e522 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -533,6 +533,7 @@ #define USB_VENDOR_ID_IDEACOM 0x1cb6 #define USB_DEVICE_ID_IDEACOM_IDC6650 0x6650 #define USB_DEVICE_ID_IDEACOM_IDC6651 0x6651 +#define USB_DEVICE_ID_IDEACOM_IDC6680 0x6680 #define USB_VENDOR_ID_ILITEK 0x222a #define USB_DEVICE_ID_ILITEK_MULTITOUCH 0x0001 diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index a83fa76655b9..f489a5cfcb48 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -99,6 +99,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0A4A, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE, HID_QUIRK_ALWAYS_POLL }, + { USB_VENDOR_ID_IDEACOM, USB_DEVICE_ID_IDEACOM_IDC6680, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_C007, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_C077, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_KEYBOARD_G710_PLUS, HID_QUIRK_NOGET }, -- cgit From e57f4e67a0c73ca1c3afb68928b917906bd82eaf Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 22 Aug 2017 08:37:52 +0200 Subject: HID: multitouch: Fix system-control buttons not working Some laptops have system-control buttons (e.g. KEY_SLEEP) on the same interface as a hid-multitouch touch-pad. This commit fixes these buttons not working. Signed-off-by: Hans de Goede Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-multitouch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 440b999304a5..5609725df714 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -930,6 +930,7 @@ static int mt_input_mapping(struct hid_device *hdev, struct hid_input *hi, field->application != HID_DG_PEN && field->application != HID_DG_TOUCHPAD && field->application != HID_GD_KEYBOARD && + field->application != HID_GD_SYSTEM_CONTROL && field->application != HID_CP_CONSUMER_CONTROL && field->application != HID_GD_WIRELESS_RADIO_CTLS && !(field->application == HID_VD_ASUS_CUSTOM_MEDIA_KEYS && -- cgit From b63c4c2718d641ba9bec888994f0cb0c23a1ef45 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Wed, 30 Aug 2017 15:13:25 -0700 Subject: HID: wacom: Properly report negative values from Intuos Pro 2 Bluetooth The wacom driver's IRQ handler for Bluetooth reports from the 2nd-gen Intuos Pro does not correctly process negative numbers. Values for tilt and rotation (which can go negative) are instead interpreted as unsigned and so jump to very large values when the data should be negative. This commit properly casts the data to ensure we report negative numbers when necessary. Fixes: 4922cd2 ("HID: wacom: Support 2nd-gen Intuos Pro's Bluetooth classic interface") Cc: stable@vger.kernel.org # v4.11 Signed-off-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index f8ddcaaa2ac6..1d9e32d2bc63 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1229,9 +1229,9 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) if (range) { input_report_abs(pen_input, ABS_X, get_unaligned_le16(&frame[1])); input_report_abs(pen_input, ABS_Y, get_unaligned_le16(&frame[3])); - input_report_abs(pen_input, ABS_TILT_X, frame[7]); - input_report_abs(pen_input, ABS_TILT_Y, frame[8]); - input_report_abs(pen_input, ABS_Z, get_unaligned_le16(&frame[9])); + input_report_abs(pen_input, ABS_TILT_X, (char)frame[7]); + input_report_abs(pen_input, ABS_TILT_Y, (char)frame[8]); + input_report_abs(pen_input, ABS_Z, (int16_t)get_unaligned_le16(&frame[9])); input_report_abs(pen_input, ABS_WHEEL, get_unaligned_le16(&frame[11])); } input_report_abs(pen_input, ABS_PRESSURE, get_unaligned_le16(&frame[5])); -- cgit From d252f4a10fb9c8f7187c6c936ff530039f8cb799 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Wed, 30 Aug 2017 15:13:26 -0700 Subject: HID: wacom: Correct coordinate system of touchring and pen twist The MobileStudio Pro, Cintiq Pro, and 2nd-gen Intuos Pro devices use a different coordinate system for their touchring and pen twist than prior devices. Prior devices had zero aligned to the tablet's left and would increase clockwise. Userspace expects data from the kernel to be in this old coordinate space, so adjustments are necessary. While the coordinate system for pen twist is formally defined by the HID standard, no such definition existed for the touchring at the time these tablets were introduced. Future tablets are expected to report touchring data using the same "zero-up clockwise-increasing" coordinate system defined for twist. Fixes: 50066a042d ("HID: wacom: generic: Add support for height, tilt, and twist usages") Fixes: 4922cd26f0 ("HID: wacom: Support 2nd-gen Intuos Pro's Bluetooth classic interface") Fixes: 60a2218698 ("HID: wacom: generic: add support for touchring") Cc: stable@vger.kernel.org # 4.10, 4.11 Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 73 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 1d9e32d2bc63..78d0398904dc 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1227,11 +1227,17 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) continue; if (range) { + /* Fix rotation alignment: userspace expects zero at left */ + int16_t rotation = (int16_t)get_unaligned_le16(&frame[9]); + rotation += 1800/4; + if (rotation > 899) + rotation -= 1800; + input_report_abs(pen_input, ABS_X, get_unaligned_le16(&frame[1])); input_report_abs(pen_input, ABS_Y, get_unaligned_le16(&frame[3])); input_report_abs(pen_input, ABS_TILT_X, (char)frame[7]); input_report_abs(pen_input, ABS_TILT_Y, (char)frame[8]); - input_report_abs(pen_input, ABS_Z, (int16_t)get_unaligned_le16(&frame[9])); + input_report_abs(pen_input, ABS_Z, rotation); input_report_abs(pen_input, ABS_WHEEL, get_unaligned_le16(&frame[11])); } input_report_abs(pen_input, ABS_PRESSURE, get_unaligned_le16(&frame[5])); @@ -1319,12 +1325,19 @@ static void wacom_intuos_pro2_bt_pad(struct wacom_wac *wacom) unsigned char *data = wacom->data; int buttons = (data[282] << 1) | ((data[281] >> 6) & 0x01); - int ring = data[285]; - int prox = buttons | (ring & 0x80); + int ring = data[285] & 0x7F; + bool ringstatus = data[285] & 0x80; + bool prox = buttons || ringstatus; + + /* Fix touchring data: userspace expects 0 at left and increasing clockwise */ + ring = 71 - ring; + ring += 3*72/16; + if (ring > 71) + ring -= 72; wacom_report_numbered_buttons(pad_input, 9, buttons); - input_report_abs(pad_input, ABS_WHEEL, (ring & 0x80) ? (ring & 0x7f) : 0); + input_report_abs(pad_input, ABS_WHEEL, ringstatus ? ring : 0); input_report_key(pad_input, wacom->tool[1], prox ? 1 : 0); input_report_abs(pad_input, ABS_MISC, prox ? PAD_DEVICE_ID : 0); @@ -1616,6 +1629,20 @@ static int wacom_tpc_irq(struct wacom_wac *wacom, size_t len) return 0; } +static int wacom_offset_rotation(struct input_dev *input, struct hid_usage *usage, + int value, int num, int denom) +{ + struct input_absinfo *abs = &input->absinfo[usage->code]; + int range = (abs->maximum - abs->minimum + 1); + + value += num*range/denom; + if (value > abs->maximum) + value -= range; + else if (value < abs->minimum) + value += range; + return value; +} + int wacom_equivalent_usage(int usage) { if ((usage & HID_USAGE_PAGE) == WACOM_HID_UP_WACOMDIGITIZER) { @@ -1898,6 +1925,7 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); int i; bool is_touch_on = value; + bool do_report = false; /* * Avoid reporting this event and setting inrange_state if this usage @@ -1912,6 +1940,29 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field } switch (equivalent_usage) { + case WACOM_HID_WD_TOUCHRING: + /* + * Userspace expects touchrings to increase in value with + * clockwise gestures and have their zero point at the + * tablet's left. HID events "should" be clockwise- + * increasing and zero at top, though the MobileStudio + * Pro and 2nd-gen Intuos Pro don't do this... + */ + if (hdev->vendor == 0x56a && + (hdev->product == 0x34d || hdev->product == 0x34e || /* MobileStudio Pro */ + hdev->product == 0x357 || hdev->product == 0x358)) { /* Intuos Pro 2 */ + value = (field->logical_maximum - value); + + if (hdev->product == 0x357 || hdev->product == 0x358) + value = wacom_offset_rotation(input, usage, value, 3, 16); + else if (hdev->product == 0x34d || hdev->product == 0x34e) + value = wacom_offset_rotation(input, usage, value, 1, 2); + } + else { + value = wacom_offset_rotation(input, usage, value, 1, 4); + } + do_report = true; + break; case WACOM_HID_WD_TOUCHRINGSTATUS: if (!value) input_event(input, usage->type, usage->code, 0); @@ -1945,10 +1996,14 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field value, i); /* fall through*/ default: + do_report = true; + break; + } + + if (do_report) { input_event(input, usage->type, usage->code, value); if (value) wacom_wac->hid_data.pad_input_event_flag = true; - break; } } @@ -2089,6 +2144,14 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); wacom_wac->serial[0] |= (__u32)value; return; + case HID_DG_TWIST: + /* + * Userspace expects pen twist to have its zero point when + * the buttons/finger is on the tablet's left. HID values + * are zero when buttons are toward the top. + */ + value = wacom_offset_rotation(input, usage, value, 1, 4); + break; case WACOM_HID_WD_SENSE: wacom_wac->hid_data.sense_state = value; return; -- cgit From 56d859e11aad4016e1cf864d65a0954d83120571 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 25 Aug 2017 18:06:04 -0400 Subject: HID: multitouch: support buttons and trackpoint on Lenovo X1 Tab Gen2 On the 2nd generation Lenovo Tablet only clickpad is working; the trackpoint and three mouse buttons do not work. hid_multitouch must export all inputs in order to get trackpoint and buttons to function. Signed-off-by: Pavel Tatashin Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-multitouch.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 11798b10e522..a98919199858 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -661,6 +661,7 @@ #define USB_DEVICE_ID_LENOVO_CBTKBD 0x6048 #define USB_DEVICE_ID_LENOVO_TPPRODOCK 0x6067 #define USB_DEVICE_ID_LENOVO_X1_COVER 0x6085 +#define USB_DEVICE_ID_LENOVO_X1_TAB 0x60a3 #define USB_VENDOR_ID_LG 0x1fd2 #define USB_DEVICE_ID_LG_MULTITOUCH 0x0064 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 5609725df714..9e8c4d2ba11d 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1420,6 +1420,12 @@ static const struct hid_device_id mt_devices[] = { USB_VENDOR_ID_ALPS_JP, HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP) }, + /* Lenovo X1 TAB Gen 2 */ + { .driver_data = MT_CLS_WIN_8_DUAL, + HID_DEVICE(BUS_USB, HID_GROUP_MULTITOUCH_WIN_8, + USB_VENDOR_ID_LENOVO, + USB_DEVICE_ID_LENOVO_X1_TAB) }, + /* Anton devices */ { .driver_data = MT_CLS_EXPORT_ALL_INPUTS, MT_USB_DEVICE(USB_VENDOR_ID_ANTON, -- cgit From fcaa4a07d2a4b541e91da7a55d8b3331f96d1865 Mon Sep 17 00:00:00 2001 From: Shrirang Bagul Date: Thu, 10 Aug 2017 17:54:01 +0800 Subject: HID: multitouch: Support ALPS PTP stick with pid 0x120A This patch adds ALPS PTP sticks with pid/device id 0x120A to the list of devices supported by hid-multitouch. Signed-off-by: Shrirang Bagul Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-multitouch.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index a98919199858..832897d4a849 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -75,6 +75,7 @@ #define USB_VENDOR_ID_ALPS_JP 0x044E #define HID_DEVICE_ID_ALPS_U1_DUAL 0x120B +#define HID_DEVICE_ID_ALPS_U1_PTP_2 0x120A #define HID_DEVICE_ID_ALPS_U1_DUAL_PTP 0x121F #define HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP 0x1220 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 9e8c4d2ba11d..c78625dceced 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1419,6 +1419,10 @@ static const struct hid_device_id mt_devices[] = { HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, USB_VENDOR_ID_ALPS_JP, HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP) }, + { .driver_data = MT_CLS_WIN_8_DUAL, + HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, + USB_VENDOR_ID_ALPS_JP, + HID_DEVICE_ID_ALPS_U1_PTP_2) }, /* Lenovo X1 TAB Gen 2 */ { .driver_data = MT_CLS_WIN_8_DUAL, -- cgit From 23e4c67ae13da7549a54f4e8c0014f48b2ef5204 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Sep 2017 23:45:34 +0200 Subject: ata: avoid gcc-7 warning in ata_timing_quantize gcc-7 warns about the result of a constant multiplication used as a boolean: drivers/ata/libata-core.c: In function 'ata_timing_quantize': drivers/ata/libata-core.c:3164:30: warning: '*' in boolean context, suggest '&&' instead [-Wint-in-bool-context] This slightly rearranges the macro to simplify the code and avoid the warning at the same time. Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 1945a8ea2099..ee4c1ec9dca0 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3234,19 +3234,19 @@ static const struct ata_timing ata_timing[] = { }; #define ENOUGH(v, unit) (((v)-1)/(unit)+1) -#define EZ(v, unit) ((v)?ENOUGH(v, unit):0) +#define EZ(v, unit) ((v)?ENOUGH(((v) * 1000), unit):0) static void ata_timing_quantize(const struct ata_timing *t, struct ata_timing *q, int T, int UT) { - q->setup = EZ(t->setup * 1000, T); - q->act8b = EZ(t->act8b * 1000, T); - q->rec8b = EZ(t->rec8b * 1000, T); - q->cyc8b = EZ(t->cyc8b * 1000, T); - q->active = EZ(t->active * 1000, T); - q->recover = EZ(t->recover * 1000, T); - q->dmack_hold = EZ(t->dmack_hold * 1000, T); - q->cycle = EZ(t->cycle * 1000, T); - q->udma = EZ(t->udma * 1000, UT); + q->setup = EZ(t->setup, T); + q->act8b = EZ(t->act8b, T); + q->rec8b = EZ(t->rec8b, T); + q->cyc8b = EZ(t->cyc8b, T); + q->active = EZ(t->active, T); + q->recover = EZ(t->recover, T); + q->dmack_hold = EZ(t->dmack_hold, T); + q->cycle = EZ(t->cycle, T); + q->udma = EZ(t->udma, UT); } void ata_timing_merge(const struct ata_timing *a, const struct ata_timing *b, -- cgit From 59cd827f26019ac790b2f34cbad478037f51c570 Mon Sep 17 00:00:00 2001 From: Matt Chen Date: Mon, 28 Aug 2017 14:57:54 +0800 Subject: iwlwifi: mvm: fix wowlan resume failed to load INIT ucode If we set disconnect on wowlan and run suspend/resume, will run into: ...snipped iwlwifi 0000:01:00.0: Failed to load firmware chunk! iwlwifi 0000:01:00.0: Could not load the [0] uCode section iwlwifi 0000:01:00.0: Failed to start INIT ucode: -110 iwlwifi 0000:01:00.0: Failed to run INIT ucode: -110 iwlwifi 0000:01:00.0: Failed to start RT ucode: -110 It is because we still keep IWL_MVM_STATUS_IN_HW_RESTART in __iwl_mvm_resume. When mac80211 starts the device as __iwl_mvm_mac_start(), we will miss iwl_mvm_restart_cleanup(mvm). Signed-off-by: Matt Chen Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 5de19ea10575..b205a7bfb828 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2167,7 +2167,7 @@ out: * 1. We are not using a unified image * 2. We are using a unified image but had an error while exiting D3 */ - set_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status); + set_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, &mvm->status); set_bit(IWL_MVM_STATUS_D3_RECONFIG, &mvm->status); /* * When switching images we return 1, which causes mac80211 -- cgit From 6110d9e5bdd15c4e60fb67f330fbf74681e7daf7 Mon Sep 17 00:00:00 2001 From: David Spinadel Date: Tue, 29 Aug 2017 13:56:02 +0300 Subject: iwlwifi: mvm: Flush non STA TX queues When starting wowlan mac80211 requests flush w/o vif and we ignore this request. As a result some packets stay stuck in the queue and it may end up with a queue hang. Allow the driver to flush queues even if station isn't specified. Signed-off-by: David Spinadel Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 44 ++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 15f2d826bb4b..64b0be73ea72 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -3975,6 +3975,43 @@ out_unlock: return ret; } +static void iwl_mvm_flush_no_vif(struct iwl_mvm *mvm, u32 queues, bool drop) +{ + if (drop) { + if (iwl_mvm_has_new_tx_api(mvm)) + /* TODO new tx api */ + WARN_ONCE(1, + "Need to implement flush TX queue\n"); + else + iwl_mvm_flush_tx_path(mvm, + iwl_mvm_flushable_queues(mvm) & queues, + 0); + } else { + if (iwl_mvm_has_new_tx_api(mvm)) { + struct ieee80211_sta *sta; + int i; + + mutex_lock(&mvm->mutex); + + for (i = 0; i < ARRAY_SIZE(mvm->fw_id_to_mac_id); i++) { + sta = rcu_dereference_protected( + mvm->fw_id_to_mac_id[i], + lockdep_is_held(&mvm->mutex)); + if (IS_ERR_OR_NULL(sta)) + continue; + + iwl_mvm_wait_sta_queues_empty(mvm, + iwl_mvm_sta_from_mac80211(sta)); + } + + mutex_unlock(&mvm->mutex); + } else { + iwl_trans_wait_tx_queues_empty(mvm->trans, + queues); + } + } +} + static void iwl_mvm_mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop) { @@ -3985,7 +4022,12 @@ static void iwl_mvm_mac_flush(struct ieee80211_hw *hw, int i; u32 msk = 0; - if (!vif || vif->type != NL80211_IFTYPE_STATION) + if (!vif) { + iwl_mvm_flush_no_vif(mvm, queues, drop); + return; + } + + if (vif->type != NL80211_IFTYPE_STATION) return; /* Make sure we're done with the deferred traffic before flushing */ -- cgit From 0fe8bed6e37c259b85d123ef9667f972305c9d6b Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Thu, 31 Aug 2017 16:27:06 +0300 Subject: iwlwifi: mvm: send all non-bufferable frames on the probe queue AP interfaces now send all non-bufferable frames using the broadcast station. Thus allow them to use the probe queue and don't warn about it. Fixes: eb045e6e0389 ("iwlwifi: mvm: Avoid deferring non bufferable frames") Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 172b5e63d3fb..6f2e2af23219 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -564,8 +564,8 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: /* - * Handle legacy hostapd as well, where station will be added - * only just before sending the association response. + * Non-bufferable frames use the broadcast station, thus they + * use the probe queue. * Also take care of the case where we send a deauth to a * station that we don't have, or similarly an association * response (with non-success status) for a station we can't @@ -573,9 +573,9 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm, * Also, disassociate frames might happen, particular with * reason 7 ("Class 3 frame received from nonassociated STA"). */ - if (ieee80211_is_probe_resp(fc) || ieee80211_is_auth(fc) || - ieee80211_is_deauth(fc) || ieee80211_is_assoc_resp(fc) || - ieee80211_is_disassoc(fc)) + if (ieee80211_is_mgmt(fc) && + (!ieee80211_is_bufferable_mmpdu(fc) || + ieee80211_is_deauth(fc) || ieee80211_is_disassoc(fc))) return mvm->probe_queue; if (info->hw_queue == info->control.vif->cab_queue) return mvmvif->cab_queue; -- cgit From bd800e41a3de5c7e56b2fd27088bdaf5e228d227 Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Mon, 28 Aug 2017 11:51:05 +0300 Subject: iwlwifi: mvm: change state when queueing agg start work Add a new state to enum iwl_mvm_agg_state, which is used between queueing the work that starts tx aggregations and actually starting that work (changing to state IWL_AGG_STARTING). This solves a race where ieee80211_start_tx_ba_session is called a second time, before the work queued by the first run has a chance to change the agg_state. In this case the second call to ieee80211_start_tx_ba_session returns an error, and the fallback is to abort the ba session start. Fixes: 482e48440a0e ("iwlwifi: mvm: change open and close criteria of a BA session") Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 3 ++- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 6 ++++-- drivers/net/wireless/intel/iwlwifi/mvm/sta.h | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index ba7bd049d3d4..0fe723ca844e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -661,7 +661,8 @@ static void rs_tl_turn_on_agg(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta, (lq_sta->tx_agg_tid_en & BIT(tid)) && (tid_data->tx_count_last >= IWL_MVM_RS_AGG_START_THRESHOLD)) { IWL_DEBUG_RATE(mvm, "try to aggregate tid %d\n", tid); - rs_tl_turn_on_agg_for_tid(mvm, lq_sta, tid, sta); + if (rs_tl_turn_on_agg_for_tid(mvm, lq_sta, tid, sta) == 0) + tid_data->state = IWL_AGG_QUEUED; } } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 411a2055dc45..2dafe9bb4d8b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -2385,8 +2385,10 @@ int iwl_mvm_sta_tx_agg_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif, if (WARN_ON_ONCE(tid >= IWL_MAX_TID_COUNT)) return -EINVAL; - if (mvmsta->tid_data[tid].state != IWL_AGG_OFF) { - IWL_ERR(mvm, "Start AGG when state is not IWL_AGG_OFF %d!\n", + if (mvmsta->tid_data[tid].state != IWL_AGG_QUEUED && + mvmsta->tid_data[tid].state != IWL_AGG_OFF) { + IWL_ERR(mvm, + "Start AGG when state is not IWL_AGG_QUEUED or IWL_AGG_OFF %d!\n", mvmsta->tid_data[tid].state); return -ENXIO; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h index d13893806513..aedabe101cf0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h @@ -281,6 +281,7 @@ struct iwl_mvm_vif; * These states relate to a specific RA / TID. * * @IWL_AGG_OFF: aggregation is not used + * @IWL_AGG_QUEUED: aggregation start work has been queued * @IWL_AGG_STARTING: aggregation are starting (between start and oper) * @IWL_AGG_ON: aggregation session is up * @IWL_EMPTYING_HW_QUEUE_ADDBA: establishing a BA session - waiting for the @@ -290,6 +291,7 @@ struct iwl_mvm_vif; */ enum iwl_mvm_agg_state { IWL_AGG_OFF = 0, + IWL_AGG_QUEUED, IWL_AGG_STARTING, IWL_AGG_ON, IWL_EMPTYING_HW_QUEUE_ADDBA, -- cgit From 8458e48ac7ad86a5ab7f3d1a8cacd9205a9a97ce Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 3 Sep 2017 16:04:38 +0300 Subject: iwlwifi: mvm: wake the correct mac80211 queue iwl_mvm_start_mac_queues() takes a bitmap of the queues to wake. When deferred tx is purged, set the bit of the hw_queue so the correct queue will be waken up. Fixes: 7e39a00d5931 ("iwlwifi: mvm: start mac queues when deferred tx frames are purged") Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 64b0be73ea72..3a6ce4222ff5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -2563,7 +2563,7 @@ static void iwl_mvm_purge_deferred_tx_frames(struct iwl_mvm *mvm, * queues, so we should never get a second deferred * frame for the RA/TID. */ - iwl_mvm_start_mac_queues(mvm, info->hw_queue); + iwl_mvm_start_mac_queues(mvm, BIT(info->hw_queue)); ieee80211_free_txskb(mvm->hw, skb); } } -- cgit From 97bce57bd7f96e1218751996f549a6e61f18cc8c Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 1 Sep 2017 17:59:15 +0300 Subject: iwlwifi: mvm: use IWL_HCMD_NOCOPY for MCAST_FILTER_CMD The MCAST_FILTER_CMD can get quite large when we have many mcast addresses to set (we support up to 255). So the command should be send as NOCOPY to prevent a warning caused by too-long commands: WARNING: CPU: 0 PID: 9700 at /root/iwlwifi/stack-dev/drivers/net/wireless/intel/iwlwifi/pcie/tx.c:1550 iwl_pcie_enqueue_hcmd+0x8c7/0xb40 [iwlwifi] Command MCAST_FILTER_CMD (0x1d0) is too large (328 bytes) This fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196743 Cc: stable@vger.kernel.org Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 3a6ce4222ff5..635db63f972e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1546,6 +1546,11 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, struct iwl_mvm_mc_iter_data *data = _data; struct iwl_mvm *mvm = data->mvm; struct iwl_mcast_filter_cmd *cmd = mvm->mcast_filter_cmd; + struct iwl_host_cmd hcmd = { + .id = MCAST_FILTER_CMD, + .flags = CMD_ASYNC, + .dataflags[0] = IWL_HCMD_DFL_NOCOPY, + }; int ret, len; /* if we don't have free ports, mcast frames will be dropped */ @@ -1560,7 +1565,10 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, memcpy(cmd->bssid, vif->bss_conf.bssid, ETH_ALEN); len = roundup(sizeof(*cmd) + cmd->count * ETH_ALEN, 4); - ret = iwl_mvm_send_cmd_pdu(mvm, MCAST_FILTER_CMD, CMD_ASYNC, len, cmd); + hcmd.len[0] = len; + hcmd.data[0] = cmd; + + ret = iwl_mvm_send_cmd(mvm, &hcmd); if (ret) IWL_ERR(mvm, "mcast filter cmd error. ret=%d\n", ret); } -- cgit From 61e7d91bcf7725b9fcd9cbfc5fa0e0f84f19e6de Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 1 Sep 2017 18:57:35 +0300 Subject: iwlwifi: mvm: handle FIF_ALLMULTI when setting multicast addresses We were ignoring the FIF_ALLMULTI flag when setting the multicast addresses with MCAST_FILTER_CMD. Check if this flag is set and enable pass_all accordingly. We also need to set the count to 0 if pass_all is enable so we don't pass addresses to the firmware when not needed (as doing so causes an assert). This fixes https://bugzilla.kernel.org/show_bug.cgi?id=196741 Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 635db63f972e..3bcaa82f59b2 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1643,6 +1643,12 @@ static void iwl_mvm_configure_filter(struct ieee80211_hw *hw, if (!cmd) goto out; + if (changed_flags & FIF_ALLMULTI) + cmd->pass_all = !!(*total_flags & FIF_ALLMULTI); + + if (cmd->pass_all) + cmd->count = 0; + iwl_mvm_recalc_multicast(mvm); out: mutex_unlock(&mvm->mutex); -- cgit From 3f497de997c7ed34ad8a90b64f1ca53a41d428b4 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 2 Sep 2017 11:05:22 +0300 Subject: iwlwifi: mvm: initialize status in iwl_mvm_add_int_sta_common() We always need to initialize the status argument to the success case before calling iwl_mvm_send_cmd_status() or iwl_mvm_send_cmd_pdu_status() (which calls the former) otherwise we may get an uninitialized value back. In this case, we use ADD_STA_SUCCESS as success. Fixes: 732d06e9d9cf ("iwlwifi: mvm: add station before allocating a queue") Reported by: Dan Carpenter Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 2dafe9bb4d8b..c4a343534c5e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -1285,7 +1285,7 @@ static int iwl_mvm_add_int_sta_common(struct iwl_mvm *mvm, { struct iwl_mvm_add_sta_cmd cmd; int ret; - u32 status; + u32 status = ADD_STA_SUCCESS; lockdep_assert_held(&mvm->mutex); -- cgit From d460f1fb83a44833a09c8eaa34b30ce553cab8c5 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 2 Sep 2017 11:25:40 +0300 Subject: iwlwifi: mvm: set status before calling iwl_mvm_send_cmd_status() We always must set the status to what we consider success before calling iwl_mvm_send_cmd_status() (also iwl_mvm_send_cmd_pdu_status() which calls it). Fix a few places where initialization is missing. Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index 50983615dce6..774122fed454 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -555,7 +555,7 @@ static int iwl_mvm_lmac_scan_abort(struct iwl_mvm *mvm) struct iwl_host_cmd cmd = { .id = SCAN_OFFLOAD_ABORT_CMD, }; - u32 status; + u32 status = CAN_ABORT_STATUS; ret = iwl_mvm_send_cmd_status(mvm, &cmd, &status); if (ret) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c index 8876c2abc440..4d907f60bce9 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c @@ -529,6 +529,7 @@ int iwl_mvm_ctdp_command(struct iwl_mvm *mvm, u32 op, u32 state) lockdep_assert_held(&mvm->mutex); + status = 0; ret = iwl_mvm_send_cmd_pdu_status(mvm, WIDE_ID(PHY_OPS_GROUP, CTDP_CONFIG_CMD), sizeof(cmd), &cmd, &status); -- cgit From 5f90472c00ddf1e64c2865f71cced297bd5f80a2 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 4 Sep 2017 20:27:04 +0300 Subject: iwlwifi: mvm: fix reorder buffer for 9000 devices The condition to check if reorder buffer ran out of space is faulty, as it takes into account only the NSSN. In case the head SN was too far behind the reorder buffer should move forward, regardless of the NSSN status. This caused the driver to release packets out of order in some scenarios. Fixes: b915c10174fb ("iwlwifi: mvm: add reorder buffer per queue") Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index 67ffd9774712..77f77bc5d083 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -672,11 +672,12 @@ static bool iwl_mvm_reorder(struct iwl_mvm *mvm, * If there was a significant jump in the nssn - adjust. * If the SN is smaller than the NSSN it might need to first go into * the reorder buffer, in which case we just release up to it and the - * rest of the function will take of storing it and releasing up to the - * nssn + * rest of the function will take care of storing it and releasing up to + * the nssn */ if (!iwl_mvm_is_sn_less(nssn, buffer->head_sn + buffer->buf_size, - buffer->buf_size)) { + buffer->buf_size) || + !ieee80211_sn_less(sn, buffer->head_sn + buffer->buf_size)) { u16 min_sn = ieee80211_sn_less(sn, nssn) ? sn : nssn; iwl_mvm_release_frames(mvm, sta, napi, buffer, min_sn); -- cgit From cac72b990d34f4c70208998a86f910ba38253c94 Mon Sep 17 00:00:00 2001 From: Lyude Date: Sat, 22 Jul 2017 21:15:09 -0400 Subject: HID: rmi: Make sure the HID device is opened on resume So it looks like that suspend/resume has actually always been broken on hid-rmi. The fact it worked was a rather silly coincidence that was relying on the HID device to already be opened upon resume. This means that so long as anything was reading the /dev/input/eventX node for for an RMI device, it would suspend and resume correctly. As well, if nothing happened to be keeping the HID device away it would shut off, then the RMI driver would get confused on resume when it stopped responding and explode. So, call hid_hw_open() in rmi_post_resume() so we make sure that the device is alive before we try talking to it. This fixes RMI device suspend/resume over HID. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196851 [jkosina@suse.cz: removed useless hunk that was zero-initializing 'ret'] Signed-off-by: Lyude Cc: Andrew Duggan Cc: stable@vger.kernel.org Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-rmi.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-rmi.c b/drivers/hid/hid-rmi.c index 5b40c2614599..ef241d66562e 100644 --- a/drivers/hid/hid-rmi.c +++ b/drivers/hid/hid-rmi.c @@ -436,17 +436,24 @@ static int rmi_post_resume(struct hid_device *hdev) if (!(data->device_flags & RMI_DEVICE)) return 0; - ret = rmi_reset_attn_mode(hdev); + /* Make sure the HID device is ready to receive events */ + ret = hid_hw_open(hdev); if (ret) return ret; + ret = rmi_reset_attn_mode(hdev); + if (ret) + goto out; + ret = rmi_driver_resume(rmi_dev, false); if (ret) { hid_warn(hdev, "Failed to resume device: %d\n", ret); - return ret; + goto out; } - return 0; +out: + hid_hw_close(hdev); + return ret; } #endif /* CONFIG_PM */ -- cgit From bfaa1ce809bbcd12b1399409ab1dbf0cdaba6e27 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Sep 2017 15:13:33 +0100 Subject: drm/amdkfd: check for null dev to avoid a null pointer dereference The call to kfd_device_by_id can potentially return null, so check that dev is null and return with -EINVAL to avoid a null pointer dereference. Detected by CoverityScan CID#1454629 ("Dereference null return value") Fixes: 5d71dbc3a588 ("drm/amdkfd: Implement image tiling mode support v2") Signed-off-by: Colin Ian King Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index e4a8c2e52cb2..660b3fbade41 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -892,6 +892,8 @@ static int kfd_ioctl_get_tile_config(struct file *filep, int err = 0; dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; dev->kfd2kgd->get_tile_config(dev->kgd, &config); -- cgit From b0e07da3f5c8d069d186a7983ff64eaebf2ea230 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 11 Sep 2017 11:39:50 +0200 Subject: qxl: fix primary surface handling The atomic conversion of the qxl driver didn't got the primary surface handling completely right. It works in the common simple cases, but fails for example when changing the display resolution using xrandr or in multihead setups. The rules are simple: There is one primary surface. Before defining a new one you have to destroy the old one. This patch makes qxl_primary_atomic_update() destroy the primary surface before defining a new one. It fixes is_primary flag updates. It adds is_primary checks so we don't try to update the primary surface in case it already has the state we want it being in. Fixes: 3538e80a869b ("drm: qxl: Atomic phase 1: Implement mode_set_nofb") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102338 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196777 Signed-off-by: Gerd Hoffmann Reviewed-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170911093950.22401-1-kraxel@redhat.com --- drivers/gpu/drm/qxl/qxl_display.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index 03fe182203ce..7babdd8f20fd 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -512,23 +512,25 @@ static void qxl_primary_atomic_update(struct drm_plane *plane, .y2 = qfb->base.height }; - if (!old_state->fb) { - qxl_io_log(qdev, - "create primary fb: %dx%d,%d,%d\n", - bo->surf.width, bo->surf.height, - bo->surf.stride, bo->surf.format); + if (old_state->fb) { + qfb_old = to_qxl_framebuffer(old_state->fb); + bo_old = gem_to_qxl_bo(qfb_old->obj); + } else { + bo_old = NULL; + } - qxl_io_create_primary(qdev, 0, bo); - bo->is_primary = true; + if (bo == bo_old) return; - } else { - qfb_old = to_qxl_framebuffer(old_state->fb); - bo_old = gem_to_qxl_bo(qfb_old->obj); + if (bo_old && bo_old->is_primary) { + qxl_io_destroy_primary(qdev); bo_old->is_primary = false; } - bo->is_primary = true; + if (!bo->is_primary) { + qxl_io_create_primary(qdev, 0, bo); + bo->is_primary = true; + } qxl_draw_dirty_fb(qdev, qfb, bo, 0, 0, &norect, 1, 1); } @@ -537,13 +539,15 @@ static void qxl_primary_atomic_disable(struct drm_plane *plane, { struct qxl_device *qdev = plane->dev->dev_private; - if (old_state->fb) - { struct qxl_framebuffer *qfb = + if (old_state->fb) { + struct qxl_framebuffer *qfb = to_qxl_framebuffer(old_state->fb); struct qxl_bo *bo = gem_to_qxl_bo(qfb->obj); - qxl_io_destroy_primary(qdev); - bo->is_primary = false; + if (bo->is_primary) { + qxl_io_destroy_primary(qdev); + bo->is_primary = false; + } } } -- cgit From 5a642e6bc49f59922e19ebd639e74f72753fc77b Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 8 Sep 2017 16:24:32 +0200 Subject: etnaviv: fix submit error path If the gpu submit fails, bail out to avoid accessing a potentially unititalized fence. CC: stable@vger.kernel.org #4.12+ Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index 6463fc2c736f..b95362186f9c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -445,8 +445,10 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, cmdbuf->user_size = ALIGN(args->stream_size, 8); ret = etnaviv_gpu_submit(gpu, submit, cmdbuf); - if (ret == 0) - cmdbuf = NULL; + if (ret) + goto out; + + cmdbuf = NULL; if (args->flags & ETNA_SUBMIT_FENCE_FD_OUT) { /* -- cgit From 518417525f3652c12fb5fad6da4ade66c0072fa3 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 11 Sep 2017 15:29:31 +0200 Subject: etnaviv: fix gem object list corruption All manipulations of the gem_object list need to be protected by the list mutex, as GEM objects can be created and freed in parallel. This fixes a kernel memory corruption. CC: stable@vger.kernel.org Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 9a3bea738330..87b95eeedd9e 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -551,12 +551,15 @@ static const struct etnaviv_gem_ops etnaviv_gem_shmem_ops = { void etnaviv_gem_free_object(struct drm_gem_object *obj) { struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); + struct etnaviv_drm_private *priv = obj->dev->dev_private; struct etnaviv_vram_mapping *mapping, *tmp; /* object should not be active */ WARN_ON(is_active(etnaviv_obj)); + mutex_lock(&priv->gem_lock); list_del(&etnaviv_obj->gem_node); + mutex_unlock(&priv->gem_lock); list_for_each_entry_safe(mapping, tmp, &etnaviv_obj->vram_list, obj_node) { -- cgit From 8320caeeffdefec3b58b9d4a7ed8e1079492fe7b Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Fri, 8 Sep 2017 10:55:27 -0700 Subject: HID: i2c-hid: allocate hid buffers for real worst case The buffer allocation is not currently accounting for an extra byte for the report id. This can cause an out of bounds access in function i2c_hid_set_or_send_report() with reportID > 15. Cc: stable@vger.kernel.org Signed-off-by: Adrian Salido Reviewed-by: Benson Leung Signed-off-by: Guenter Roeck Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index 77396145d2d0..9145c2129a96 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -543,7 +543,8 @@ static int i2c_hid_alloc_buffers(struct i2c_hid *ihid, size_t report_size) { /* the worst case is computed from the set_report command with a * reportID > 15 and the maximum report length */ - int args_len = sizeof(__u8) + /* optional ReportID byte */ + int args_len = sizeof(__u8) + /* ReportID */ + sizeof(__u8) + /* optional ReportID byte */ sizeof(__u16) + /* data register */ sizeof(__u16) + /* size of the report */ report_size; /* report */ -- cgit From 993f0d93f8538c15bd5c12a1a9fd74c777efea1b Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Thu, 7 Sep 2017 17:44:12 -0700 Subject: HID: wacom: generic: Send MSC_SERIAL and ABS_MISC when leaving prox The latest generation of pro devices (MobileStudio Pro, 2nd-gen Intuos Pro, Cintiq Pro) send a serial number of '0' whenever the pen is too far away for reliable communication. Userspace defines that a serial number of '0' is invalid, so we need to be careful not to actually forward this value. Additionally, since EMR ISDv4 devices do not support serial numbers or tool IDs, we'd like to not send these events if they aren't necessary. The existing code achieves these goals by adding a check for a non-zero serial number within the wacom_wac_pen_report function. The MSC_SERIAL and ABS_MISC events are only sent if the serial number is non-zero. This code fails, however when the pen for a pro device leaves proximity. When the pen leaves prox and the tablet sends a serial of 0, wacom_wac_pen_event dutifully clears the serial number. When wacom_wac_pen_report is called, it does not send either the MSC_SERIAL of the exiting tool nor an ABS_MISC event. This patch prevents the wacom_wac_pen_event function from clearing an already-set serial number. This ensures that we have the serial number handy when exiting proximity, but requires us to manually clear it afterwards to ensure the driver does not send stale data (e.g. when switching between AES pens that report a serial nubmer of 0 for the first few fully in-proximity packets). Fixes: f85c9dc678 ("HID: wacom: generic: Support tool ID and additional tool types") Cc: stable # v4.10 Signed-off-by: Ping Cheng Signed-off-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 78d0398904dc..e2ba36da6e20 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2141,8 +2141,10 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field wacom_wac->hid_data.tipswitch |= value; return; case HID_DG_TOOLSERIALNUMBER: - wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); - wacom_wac->serial[0] |= (__u32)value; + if (value) { + wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); + wacom_wac->serial[0] |= (__u32)value; + } return; case HID_DG_TWIST: /* @@ -2156,15 +2158,17 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field wacom_wac->hid_data.sense_state = value; return; case WACOM_HID_WD_SERIALHI: - wacom_wac->serial[0] = (wacom_wac->serial[0] & 0xFFFFFFFF); - wacom_wac->serial[0] |= ((__u64)value) << 32; - /* - * Non-USI EMR devices may contain additional tool type - * information here. See WACOM_HID_WD_TOOLTYPE case for - * more details. - */ - if (value >> 20 == 1) { - wacom_wac->id[0] |= value & 0xFFFFF; + if (value) { + wacom_wac->serial[0] = (wacom_wac->serial[0] & 0xFFFFFFFF); + wacom_wac->serial[0] |= ((__u64)value) << 32; + /* + * Non-USI EMR devices may contain additional tool type + * information here. See WACOM_HID_WD_TOOLTYPE case for + * more details. + */ + if (value >> 20 == 1) { + wacom_wac->id[0] |= value & 0xFFFFF; + } } return; case WACOM_HID_WD_TOOLTYPE: @@ -2279,6 +2283,7 @@ static void wacom_wac_pen_report(struct hid_device *hdev, if (!prox) { wacom_wac->tool[0] = 0; wacom_wac->id[0] = 0; + wacom_wac->serial[0] = 0; } } -- cgit From 92380b572d95caf48f8424746aeee63c5a2b1922 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Thu, 7 Sep 2017 17:47:38 -0700 Subject: HID: wacom: generic: Clear ABS_MISC when tool leaves proximity The tool ID information sent in ABS_MISC is expected to be reset to 0 when a tool leaves proximity. Not doing this can cause problems if a tool is removed and then re-introduced. Kernel event filtering will prevent the (identical) ABS_MISC event from being sent when the tool re-enters proxmity. This can cause userspace to not properly set the tool ID. Fixes: f85c9dc678 ("HID: wacom: generic: Support tool ID and additional tool types") Cc: stable # v4.10 Signed-off-by: Ping Cheng Signed-off-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index e2ba36da6e20..aa692e28b2cd 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2272,7 +2272,7 @@ static void wacom_wac_pen_report(struct hid_device *hdev, input_report_key(input, wacom_wac->tool[0], prox); if (wacom_wac->serial[0]) { input_event(input, EV_MSC, MSC_SERIAL, wacom_wac->serial[0]); - input_report_abs(input, ABS_MISC, id); + input_report_abs(input, ABS_MISC, prox ? id : 0); } wacom_wac->hid_data.tipswitch = false; -- cgit From d0b6e0a8ef24b1b07078ababe5d91bcdf4f4264a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Sep 2017 21:36:55 +0200 Subject: watchdog/hardlockup: Provide interface to stop/restart perf events Provide an interface to stop and restart perf NMI watchdog events on all CPUs. This is only usable during init and especially for handling the perf HT bug on Intel machines. It's safe to use it this way as nothing can start/stop the NMI watchdog in parallel. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.167649596@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 4 ++++ kernel/watchdog_hld.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a36abe2da13e..b24d4a58674a 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -75,7 +75,11 @@ static inline void hardlockup_detector_disable(void) {} #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); +extern void hardlockup_detector_perf_stop(void); +extern void hardlockup_detector_perf_restart(void); #else +static inline void hardlockup_detector_perf_stop(void) { } +static inline void hardlockup_detector_perf_restart(void) { } #if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3a09ea1b1d3d..c9586ebc2e98 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -261,3 +261,44 @@ void watchdog_nmi_disable(unsigned int cpu) firstcpu_err = 0; } } + +/** + * hardlockup_detector_perf_stop - Globally stop watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_stop(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_disable(event); + } +} + +/** + * hardlockup_detector_perf_restart - Globally restart watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_restart(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_enable(event); + } +} -- cgit From 2406e3b166eee42777a6b0b38f52f924454474d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Sep 2017 21:36:56 +0200 Subject: perf/x86/intel, watchdog/core: Sanitize PMU HT bug workaround The lockup_detector_suspend/resume() interface is broken in several ways especially as it results in recursive locking of the CPU hotplug lock. Use the new stop/restart interface in the perf NMI watchdog to temporarily disable and reenable the already active watchdog events. That's enough to handle it. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.247141871@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 829e89cfcee2..9fb9a1f1e47b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void) return 0; } - if (lockup_detector_suspend() != 0) { - pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); - return 0; - } + cpus_read_lock(); + + hardlockup_detector_perf_stop(); x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - lockup_detector_resume(); - - cpus_read_lock(); + hardlockup_detector_perf_restart(); for_each_online_cpu(c) free_excl_cntrs(c); -- cgit From 6554fd8cf06db86f861bb24d7487b2873ca444c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:36:57 +0200 Subject: watchdog/core: Provide interface to stop from poweroff() PARISC has a a busy looping power off routine. If the watchdog is enabled the watchdog timer will still fire, but the thread is not running, which causes the softlockup watchdog to trigger. Provide a interface which allows to turn the watchdog off. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Helge Deller Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/20170912194146.327343752@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 6 +++--- kernel/watchdog.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index b24d4a58674a..85bb268be39c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -12,10 +12,10 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); +void lockup_detector_soft_poweroff(void); #else -static inline void lockup_detector_init(void) -{ -} +static inline void lockup_detector_init(void) { } +static inline void lockup_detector_soft_poweroff(void) { } #endif #ifdef CONFIG_SOFTLOCKUP_DETECTOR diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f5d52024f6b7..f23e373aa3bf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -333,7 +333,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled || + atomic_read(&watchdog_park_in_progress) != 0) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -660,6 +661,17 @@ static void set_sample_period(void) } #endif /* SOFTLOCKUP */ +/** + * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) + * + * Special interface for parisc. It prevents lockup detector warnings from + * the default pm_poweroff() function which busy loops forever. + */ +void lockup_detector_soft_poweroff(void) +{ + watchdog_enabled = 0; +} + /* * Suspend the hard and soft lockup detector by parking the watchdog threads. */ -- cgit From 47bb4baf7df43ac8bbc51c24022466972ba29ef1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:36:58 +0200 Subject: parisc, watchdog/core: Use lockup_detector_stop() The broken lockup_detector_suspend/resume() interface is going away. Use the new lockup_detector_soft_poweroff() interface to stop the watchdog from the busy looping power off routine. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Helge Deller Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/20170912194146.407385557@linutronix.de Signed-off-by: Ingo Molnar --- arch/parisc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index a45a67d526f8..30f92391a93e 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -146,7 +146,7 @@ void machine_power_off(void) /* prevent soft lockup/stalled CPU messages for endless loop. */ rcu_sysrq_start(); - lockup_detector_suspend(); + lockup_detector_soft_poweroff(); for (;;); } -- cgit From 5490125d77a43016b26f629d4b485e2c62172551 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:36:59 +0200 Subject: watchdog/core: Remove broken suspend/resume interfaces This interface has several issues: - It's causing recursive locking of the hotplug lock. - It's complete overkill to teardown all threads and then recreate them The same can be achieved with the simple hardlockup_detector_perf_stop / restart() interfaces. The abuse from the busy looping poweroff() loop of PARISC has been solved as well. Remove the cruft. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.487537732@linutronix.de Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/watchdog.c | 3 -- include/linux/nmi.h | 12 ------ kernel/watchdog.c | 89 +----------------------------------------- 3 files changed, 1 insertion(+), 103 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2f6eadd9408d..5ded171f02d6 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -310,9 +310,6 @@ static int start_wd_on_cpu(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) return 0; - if (watchdog_suspended) - return 0; - if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) return 0; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 85bb268be39c..7eefe7abf44b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -164,7 +164,6 @@ extern int watchdog_thresh; extern unsigned long watchdog_enabled; extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -extern int __read_mostly watchdog_suspended; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; @@ -192,17 +191,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int lockup_detector_suspend(void); -extern void lockup_detector_resume(void); -#else -static inline int lockup_detector_suspend(void) -{ - return 0; -} - -static inline void lockup_detector_resume(void) -{ -} #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f23e373aa3bf..b2d46757917e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -97,19 +97,6 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); * unregistered/stopped, so it is an indicator whether the threads exist. */ static int __read_mostly watchdog_running; -/* - * If a subsystem has a need to deactivate the watchdog temporarily, it - * can use the suspend/resume interface to achieve this. The content of - * the 'watchdog_suspended' variable reflects this state. Existing threads - * are parked/unparked by the lockup_detector_{suspend|resume} functions - * (see comment blocks pertaining to those functions for further details). - * - * 'watchdog_suspended' also prevents threads from being registered/started - * or unregistered/stopped via parameters in /proc/sys/kernel, so the state - * of 'watchdog_running' cannot change while the watchdog is deactivated - * temporarily (see related code in 'proc' handlers). - */ -int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its @@ -136,7 +123,6 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * - watchdog_cpumask * - sysctl_hardlockup_all_cpu_backtrace * - hardlockup_panic - * - watchdog_suspended */ void __weak watchdog_nmi_reconfigure(void) { @@ -672,61 +658,6 @@ void lockup_detector_soft_poweroff(void) watchdog_enabled = 0; } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - #ifdef CONFIG_SYSCTL /* @@ -775,12 +706,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -872,12 +797,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -917,12 +836,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -941,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, watchdog_nmi_reconfigure(); } -out: + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); return err; -- cgit From b7a349819d4b9b5db64e523351e66a79a758eaa5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:00 +0200 Subject: watchdog/core: Rework CPU hotplug locking The watchdog proc interface causes extensive recursive locking of the CPU hotplug percpu rwsem, which is deadlock prone. Replace the get/put_online_cpus() pairs with cpu_hotplug_disable()/enable() calls for now. Later patches will remove that requirement completely. Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.568079057@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b2d46757917e..cd79f644ea34 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -703,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int err, old, new; int *watchdog_param = (int *)table->data; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); /* @@ -752,7 +752,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } @@ -794,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old, new; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); old = ACCESS_ONCE(watchdog_thresh); @@ -818,7 +818,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } @@ -833,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); @@ -856,7 +856,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } -- cgit From 946d197794b23202b8b46c43016747c72fe23393 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:01 +0200 Subject: watchdog/core: Rename watchdog_proc_mutex Following patches will use the mutex for other purposes as well. Rename it as it is not longer a proc specific thing. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.647714850@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cd79f644ea34..7c3a0a76b41b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,8 +29,7 @@ #include #include -/* Watchdog configuration */ -static DEFINE_MUTEX(watchdog_proc_mutex); +static DEFINE_MUTEX(watchdog_mutex); int __read_mostly nmi_watchdog_enabled; @@ -704,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int *watchdog_param = (int *)table->data; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); /* * If the parameter is being read return the state of the corresponding @@ -751,7 +750,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, err = proc_watchdog_update(); } out: - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } @@ -795,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, int err, old, new; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -817,7 +816,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, set_sample_period(); } out: - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } @@ -834,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, int err; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { @@ -855,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, watchdog_nmi_reconfigure(); } - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } -- cgit From 7a3558200739e1378800a7a6d7f63c031115f7a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:02 +0200 Subject: watchdog/core: Mark hardlockup_detector_disable() __init The function is only used by the KVM init code. Mark it __init to prevent creative abuse. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.727134632@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7c3a0a76b41b..1c185d9dd468 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -55,7 +55,7 @@ unsigned int __read_mostly hardlockup_panic = * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ -void hardlockup_detector_disable(void) +void __init hardlockup_detector_disable(void) { watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; } -- cgit From 20d853fd0703b1d73c35a22024c0d4fcbcc57c8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:03 +0200 Subject: watchdog/hardlockup/perf: Remove broken self disable on failure The self disabling feature is broken vs. CPU hotplug locking: CPU 0 CPU 1 cpus_write_lock(); cpu_up(1) wait_for_completion() .... unpark_watchdog() ->unpark() perf_event_create() <- fails watchdog_enable &= ~NMI_WATCHDOG; .... cpus_write_unlock(); CPU 2 cpus_write_lock() cpu_down(2) wait_for_completion() wakeup(watchdog); watchdog() if (!(watchdog_enable & NMI_WATCHDOG)) watchdog_nmi_disable() perf_event_disable() .... cpus_read_lock(); stop_smpboot_threads() park_watchdog(); wait_for_completion(watchdog->parked); Result: End of hotplug and instantaneous full lockup of the machine. There is a similar problem with disabling the watchdog via the user space interface as the sysctl function fiddles with watchdog_enable directly. It's very debatable whether this is required at all. If the watchdog works nicely on N CPUs and it fails to enable on the N + 1 CPU either during hotplug or because the user space interface disabled it via sysctl cpumask and then some perf user grabbed the counter which is then unavailable for the watchdog when the sysctl cpumask gets changed back. There is no real justification for this. One of the reasons WHY this is done is the utter stupidity of the init code of the perf NMI watchdog. Instead of checking upfront at boot whether PERF is available and functional at all, it just does this check at run time over and over when user space fiddles with the sysctl. That's broken beyond repair along with the idiotic error code dependent warn level printks and the even more silly printk rate limiting. If the init code checks whether perf works at boot time, then this mess can be more or less avoided completely. Perf does not come magically into life at runtime. Brain usage while coding is overrated. Remove the cruft and add a temporary safe guard which gets removed later. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.806708429@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 15 --------------- kernel/watchdog_hld.c | 20 +++++++------------- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1c185d9dd468..af000956286c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -485,21 +485,6 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); - - /* - * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the - * failure path. Check for failures that can occur asynchronously - - * for example, when CPUs are on-lined - and shut down the hardware - * perf event on each CPU accordingly. - * - * The only non-obvious place this bit can be cleared is through - * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a - * pr_info here would be too noisy as it would result in a message - * every few seconds if the hardlockup was disabled but the softlockup - * enabled. - */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - watchdog_nmi_disable(cpu); } static struct smp_hotplug_thread watchdog_threads = { diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index c9586ebc2e98..7b602714ea53 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -23,6 +23,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); static unsigned long hardlockup_allcpu_dumped; +static bool hardlockup_detector_disabled; void arch_touch_nmi_watchdog(void) { @@ -178,6 +179,10 @@ int watchdog_nmi_enable(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) goto out; + /* A failure disabled the hardlockup detector permanently */ + if (hardlockup_detector_disabled) + return -ENODEV; + /* is it already setup and enabled? */ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -206,18 +211,6 @@ int watchdog_nmi_enable(unsigned int cpu) goto out_save; } - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - /* skip displaying the same error again */ if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); @@ -232,7 +225,8 @@ int watchdog_nmi_enable(unsigned int cpu) pr_err("disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); - pr_info("Shutting down hard lockup detector on all cpus\n"); + pr_info("Disabling hard lockup detector permanently\n"); + hardlockup_detector_disabled = true; return PTR_ERR(event); -- cgit From 941154bd6937a710ae9193a3c733c0029e5ae7b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:04 +0200 Subject: watchdog/hardlockup/perf: Prevent CPU hotplug deadlock The following deadlock is possible in the watchdog hotplug code: cpus_write_lock() ... takedown_cpu() smpboot_park_threads() smpboot_park_thread() kthread_park() ->park() := watchdog_disable() watchdog_nmi_disable() perf_event_release_kernel(); put_event() _free_event() ->destroy() := hw_perf_event_destroy() x86_release_hardware() release_ds_buffers() get_online_cpus() when a per cpu watchdog perf event is destroyed which drops the last reference to the PMU hardware. The cleanup code there invokes get_online_cpus() which instantly deadlocks because the hotplug percpu rwsem is write locked. To solve this add a deferring mechanism: cpus_write_lock() kthread_park() watchdog_nmi_disable(deferred) perf_event_disable(event); move_event_to_deferred(event); .... cpus_write_unlock() cleaup_deferred_events() perf_event_release_kernel() This is still properly serialized against concurrent hotplug via the cpu_add_remove_lock, which is held by the task which initiated the hotplug event. This is also used to handle event destruction when the watchdog threads are parked via other mechanisms than CPU hotplug. Analyzed-by: Peter Zijlstra Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.884469246@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 6 ++++++ kernel/cpu.c | 6 ++++++ kernel/watchdog.c | 25 +++++++++++++++++++++++++ kernel/watchdog_hld.c | 34 ++++++++++++++++++++++++++++------ 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 7eefe7abf44b..80354e6fa86d 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -13,9 +13,11 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); void lockup_detector_soft_poweroff(void); +void lockup_detector_cleanup(void); #else static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } +static inline void lockup_detector_cleanup(void) { } #endif #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -77,9 +79,13 @@ static inline void hardlockup_detector_disable(void) {} extern void arch_touch_nmi_watchdog(void); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); +extern void hardlockup_detector_perf_disable(void); +extern void hardlockup_detector_perf_cleanup(void); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } +static inline void hardlockup_detector_perf_disable(void) { } +static inline void hardlockup_detector_perf_cleanup(void) { } #if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..a96b348591df 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -734,6 +735,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); + /* + * Do post unplug cleanup. This is still protected against + * concurrent CPU hotplug via cpu_add_remove_lock. + */ + lockup_detector_cleanup(); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index af000956286c..dd1fd59683c5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -109,8 +109,10 @@ int __weak watchdog_nmi_enable(unsigned int cpu) { return 0; } + void __weak watchdog_nmi_disable(unsigned int cpu) { + hardlockup_detector_perf_disable(); } /* @@ -193,6 +195,8 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif #endif +static void __lockup_detector_cleanup(void); + /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally @@ -631,6 +635,24 @@ static void set_sample_period(void) } #endif /* SOFTLOCKUP */ +static void __lockup_detector_cleanup(void) +{ + lockdep_assert_held(&watchdog_mutex); + hardlockup_detector_perf_cleanup(); +} + +/** + * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes + * + * Caller must not hold the cpu hotplug rwsem. + */ +void lockup_detector_cleanup(void) +{ + mutex_lock(&watchdog_mutex); + __lockup_detector_cleanup(); + mutex_unlock(&watchdog_mutex); +} + /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * @@ -665,6 +687,8 @@ static int proc_watchdog_update(void) watchdog_nmi_reconfigure(); + __lockup_detector_cleanup(); + return err; } @@ -837,6 +861,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } watchdog_nmi_reconfigure(); + __lockup_detector_cleanup(); } mutex_unlock(&watchdog_mutex); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 7b602714ea53..94111ccb09b5 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,6 +21,8 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); +static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; static bool hardlockup_detector_disabled; @@ -239,16 +241,18 @@ out: return 0; } -void watchdog_nmi_disable(unsigned int cpu) +/** + * hardlockup_detector_perf_disable - Disable the local event + */ +void hardlockup_detector_perf_disable(void) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = this_cpu_read(watchdog_ev); if (event) { perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); + cpumask_set_cpu(smp_processor_id(), &dead_events_mask); /* watchdog_nmi_enable() expects this to be zero initially. */ if (atomic_dec_and_test(&watchdog_cpus)) @@ -256,6 +260,24 @@ void watchdog_nmi_disable(unsigned int cpu) } } +/** + * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them + * + * Called from lockup_detector_cleanup(). Serialized by the caller. + */ +void hardlockup_detector_perf_cleanup(void) +{ + int cpu; + + for_each_cpu(cpu, &dead_events_mask) { + struct perf_event *event = per_cpu(dead_event, cpu); + + per_cpu(dead_event, cpu) = NULL; + perf_event_release_kernel(event); + } + cpumask_clear(&dead_events_mask); +} + /** * hardlockup_detector_perf_stop - Globally stop watchdog events * -- cgit From 01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:05 +0200 Subject: watchdog/core: Remove the park_in_progress obfuscation Commit: b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded system") tries to fix the following issue: proc_write() set_sample_period() <--- New sample period becoms visible <----- Broken starts proc_watchdog_update() watchdog_enable_all_cpus() watchdog_hrtimer_fn() update_watchdog_all_cpus() restart_timer(sample_period) watchdog_park_threads() thread->park() disable_nmi() <----- Broken ends The reason why this is broken is that the update of the watchdog threshold becomes immediately effective and visible for the hrtimer function which uses that value to rearm the timer. But the NMI/perf side still uses the old value up to the point where it is disabled. If the rate has been lowered then the NMI can run fast enough to 'detect' a hard lockup because the timer has not fired due to the longer period. The patch 'fixed' this by adding a variable: proc_write() set_sample_period() <----- Broken starts proc_watchdog_update() watchdog_enable_all_cpus() watchdog_hrtimer_fn() update_watchdog_all_cpus() restart_timer(sample_period) watchdog_park_threads() park_in_progress = 1 <----- Broken ends nmi_watchdog() if (park_in_progress) return; The only effect of this variable was to make the window where the breakage can hit small enough that it was not longer observable in testing. From a correctness point of view it is a pointless bandaid which merily papers over the root cause: the unsychronized update of the variable. Looking deeper into the related code pathes unearthed similar problems in the watchdog_start()/stop() functions. watchdog_start() perf_nmi_event_start() hrtimer_start() watchdog_stop() hrtimer_cancel() perf_nmi_event_stop() In both cases the call order is wrong because if the tasks gets preempted or the VM gets scheduled out long enough after the first call, then there is a chance that the next NMI will see a stale hrtimer interrupt count and trigger a false positive hard lockup splat. Get rid of park_in_progress so the code can be gradually deobfuscated and pruned from several layers of duct tape papering over the root cause, which has been either ignored or not understood at all. Once this is removed the underlying problem will be fixed by rewriting the proc interface to do a proper synchronized update. Address the start/stop() ordering problem as well by reverting the call order, so this part is at least correct now. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 1 - kernel/watchdog.c | 37 +++++++++++++++++-------------------- kernel/watchdog_hld.c | 7 ++----- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 80354e6fa86d..91a3a4a4c8ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; extern int soft_watchdog_enabled; -extern atomic_t watchdog_park_in_progress; #else static inline void touch_softlockup_watchdog_sched(void) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dd1fd59683c5..c290135fb415 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -136,8 +136,6 @@ void __weak watchdog_nmi_reconfigure(void) #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -322,8 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (!watchdog_enabled || - atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -437,32 +434,37 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - /* kick off the timer for the hardlockup detector */ + /* + * Start the timer first to prevent the NMI watchdog triggering + * before the timer has a chance to fire. + */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); + /* Initialize timestamp */ + __touch_watchdog(); /* Enable the perf event */ watchdog_nmi_enable(cpu); - /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); - - /* initialize timestamp */ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); - __touch_watchdog(); } static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); - hrtimer_cancel(hrtimer); - /* disable the perf event */ + /* + * Disable the perf event first. That prevents that a large delay + * between disabling the timer and disabling the perf event causes + * the perf NMI to detect a false positive. + */ watchdog_nmi_disable(cpu); + hrtimer_cancel(hrtimer); } static void watchdog_cleanup(unsigned int cpu, bool online) @@ -518,16 +520,11 @@ static int watchdog_park_threads(void) { int cpu, ret = 0; - atomic_set(&watchdog_park_in_progress, 1); - for_each_watchdog_cpu(cpu) { ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); if (ret) break; } - - atomic_set(&watchdog_park_in_progress, 0); - return ret; } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 94111ccb09b5..0aa191ee3d51 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr = { /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; - if (atomic_read(&watchdog_park_in_progress) != 0) - return; - if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; -- cgit From 2b9d7f233b835663cbc7b6b3f88dd20f61118d1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:06 +0200 Subject: watchdog/core: Clean up stub functions Having stub functions which take a full page is not helping the readablility of code. Condense them and move the doubled #ifdef variant into the SYSFS section. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.045545271@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 68 ++++++++++++++++++------------------------------------- 1 file changed, 22 insertions(+), 46 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c290135fb415..af37c040436c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -125,10 +125,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * - sysctl_hardlockup_all_cpu_backtrace * - hardlockup_panic */ -void __weak watchdog_nmi_reconfigure(void) -{ -} - +void __weak watchdog_nmi_reconfigure(void) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -136,6 +133,11 @@ void __weak watchdog_nmi_reconfigure(void) #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +/* Global variables, exported for sysctl */ +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +int __read_mostly soft_watchdog_enabled; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -149,13 +151,9 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; } __setup("softlockup_panic=", softlockup_panic_setup); @@ -593,44 +591,13 @@ static void watchdog_disable_all_cpus(void) } } -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask); -} -#endif - -#else /* SOFTLOCKUP */ -static int watchdog_park_threads(void) -{ - return 0; -} - -static void watchdog_unpark_threads(void) -{ -} - -static int watchdog_enable_all_cpus(void) -{ - return 0; -} - -static void watchdog_disable_all_cpus(void) -{ -} - -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return 0; -} -#endif - -static void set_sample_period(void) -{ -} -#endif /* SOFTLOCKUP */ +#else /* CONFIG_SOFTLOCKUP_DETECTOR */ +static inline int watchdog_park_threads(void) { return 0; } +static inline void watchdog_unpark_threads(void) { } +static inline int watchdog_enable_all_cpus(void) { return 0; } +static inline void watchdog_disable_all_cpus(void) { } +static inline void set_sample_period(void) { } +#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) { @@ -827,6 +794,15 @@ out: return err; } +static int watchdog_update_cpus(void) +{ + if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { + return smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_cpumask); + } + return 0; +} + /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the -- cgit From 368a7e2ce8ff0ddcdcb37eadb76530b033f6eb2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:07 +0200 Subject: watchdog/core: Clean up the #ifdef maze The #ifdef maze in this file is horrible, group stuff at least a bit so one can figure out what belongs to what. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.139629546@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index af37c040436c..a9bdfde73e4b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -41,7 +41,6 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* boot commands */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -74,19 +73,21 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -#endif +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#ifdef CONFIG_SOFTLOCKUP_DETECTOR -int __read_mostly soft_watchdog_enabled; -#endif +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); +# endif /* CONFIG_SMP */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#endif struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -173,22 +174,14 @@ static int __init nosoftlockup_setup(char *str) __setup("nosoftlockup", nosoftlockup_setup); #ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; + static int __init softlockup_all_cpu_backtrace_setup(char *str) { - sysctl_softlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); + sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -#endif #endif static void __lockup_detector_cleanup(void); -- cgit From 05ba3de74a3f499dcaa37b186220aaf174c95a4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:08 +0200 Subject: watchdog/core: Split out cpumask write function Split the write part of the cpumask proc handler out into a separate helper to avoid deep indentation. This also reduces the patch complexity in the following cleanups. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.218075991@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a9bdfde73e4b..cedf45ab4d81 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -792,10 +792,29 @@ static int watchdog_update_cpus(void) if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { return smpboot_update_cpumask_percpu_thread(&watchdog_threads, &watchdog_cpumask); + __lockup_detector_cleanup(); } return 0; } +static void proc_watchdog_cpumask_update(void) +{ + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); + + if (watchdog_running) { + /* + * Failure would be due to being unable to allocate a + * temporary cpumask, so we are likely not in a position to + * do much else to make things better. + */ + if (watchdog_update_cpus() != 0) + pr_err("cpumask update failed\n"); + } + + watchdog_nmi_reconfigure(); +} + /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the @@ -811,30 +830,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); - if (!err && write) { - /* Remove impossible cpus to keep sysctl output cleaner. */ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, - cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate - * a temporary cpumask, so we are likely not in a - * position to do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } - - watchdog_nmi_reconfigure(); - __lockup_detector_cleanup(); - } + if (!err && write) + proc_watchdog_cpumask_update(); mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } - #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) -- cgit From 0d85923c7a81719567311ba0eae8ecb2efd4c8a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:09 +0200 Subject: smpboot/threads, watchdog/core: Avoid runtime allocation smpboot_update_cpumask_threads_percpu() allocates a temporary cpumask at runtime. This is suboptimal because the call site needs more code size for proper error handling than a statically allocated temporary mask requires data size. Add static temporary cpumask. The function is globaly serialized, so no further protection required. Remove the half baken error handling in the watchdog code and get rid of the export as there are no in tree modular users of that function. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.297288838@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/smpboot.h | 4 ++-- kernel/smpboot.c | 22 +++++++--------------- kernel/watchdog.c | 21 +++++---------------- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index 12910cf19869..c149aa7bedf3 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -55,7 +55,7 @@ smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) } void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *); +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1d71c051a951..ed7507b69b48 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -344,39 +344,31 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); * by the client, but only by calling this function. * This function can only be called on a registered smp_hotplug_thread. */ -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) { struct cpumask *old = plug_thread->cpumask; - cpumask_var_t tmp; + static struct cpumask tmp; unsigned int cpu; - if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) - return -ENOMEM; - get_online_cpus(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(tmp, old, new); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, old, new); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_park_thread(plug_thread, cpu); /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(tmp, new, old); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, new, old); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_unpark_thread(plug_thread, cpu); cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); - - free_cpumask_var(tmp); - - return 0; } -EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cedf45ab4d81..8935a3a4c2fb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -787,31 +787,20 @@ out: return err; } -static int watchdog_update_cpus(void) +static void watchdog_update_cpus(void) { - if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { - return smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_cpumask); + if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) { + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_cpumask); __lockup_detector_cleanup(); } - return 0; } static void proc_watchdog_cpumask_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate a - * temporary cpumask, so we are likely not in a position to - * do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } - + watchdog_update_cpus(); watchdog_nmi_reconfigure(); } -- cgit From 2eb2527f847d1bd8d8fb9db1e8139db5d6eddb36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:10 +0200 Subject: watchdog/core: Create new thread handling infrastructure The lockup detector reconfiguration tears down all watchdog threads when the watchdog is disabled and sets them up again when its enabled. That's a pointless exercise. The watchdog threads are not consuming an insane amount of resources, so it's enough to set them up at init time and keep them in parked position when the watchdog is disabled and unpark them when it is reenabled. The smpboot thread infrastructure takes care of keeping the force parked threads in place even across cpu hotplug. Another horrible mechanism are the open coded park/unpark loops which are used for reconfiguration of the watchdog. The smpboot infrastructure allows exactly the same via smpboot_update_cpumask_thread_percpu(), which is cpu hotplug safe. Using that instead of the open coded loops allows to get rid of the hotplug locking mess in the watchdog code. Implement a clean infrastructure which allows to replace the open coded nonsense. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.377182587@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8935a3a4c2fb..b35518375fb7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -139,6 +139,9 @@ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; int __read_mostly soft_watchdog_enabled; +struct cpumask watchdog_allowed_mask __read_mostly; +static bool softlockup_threads_initialized __read_mostly; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -584,12 +587,84 @@ static void watchdog_disable_all_cpus(void) } } +static void softlockup_update_smpboot_threads(void) +{ + lockdep_assert_held(&watchdog_mutex); + + if (!softlockup_threads_initialized) + return; + + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_allowed_mask); + __lockup_detector_cleanup(); +} + +/* Temporarily park all watchdog threads */ +static void softlockup_park_all_threads(void) +{ + cpumask_clear(&watchdog_allowed_mask); + softlockup_update_smpboot_threads(); +} + +/* + * Park threads which are not longer enabled and unpark threads which have + * been newly enabled. + */ +static void softlockup_update_threads(void) +{ + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + softlockup_update_smpboot_threads(); +} + +static void softlockup_reconfigure_threads(bool enabled) +{ + softlockup_park_all_threads(); + set_sample_period(); + if (enabled) + softlockup_update_threads(); +} + +/* + * Create the watchdog thread infrastructure. + * + * The threads are not unparked as watchdog_allowed_mask is empty. When + * the threads are sucessfully initialized, take the proper locks and + * unpark the threads in the watchdog_cpumask if the watchdog is enabled. + */ +static __init void softlockup_init_threads(void) +{ + int ret; + + /* + * If sysctl is off and watchdog got disabled on the command line, + * nothing to do here. + */ + if (!IS_ENABLED(CONFIG_SYSCTL) && + !(watchdog_enabled && watchdog_thresh)) + return; + + ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_allowed_mask); + if (ret) { + pr_err("Failed to initialize soft lockup detector threads\n"); + return; + } + + mutex_lock(&watchdog_mutex); + softlockup_threads_initialized = true; + softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + mutex_unlock(&watchdog_mutex); +} + #else /* CONFIG_SOFTLOCKUP_DETECTOR */ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void set_sample_period(void) { } +static inline void softlockup_init_threads(void) { } +static inline void softlockup_update_threads(void) { } +static inline void softlockup_reconfigure_threads(bool enabled) { } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) -- cgit From d57108d4f6791291e89d980e7f7a3566c32ab188 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:11 +0200 Subject: watchdog/core: Get rid of the thread teardown/setup dance The lockup detector reconfiguration tears down all watchdog threads when the watchdog is disabled and sets them up again when its enabled. That's a pointless exercise. The watchdog threads are not consuming an insane amount of resources, so it's enough to set them up at init time and keep them in parked position when the watchdog is disabled and unpark them when it is reenabled. The smpboot thread infrastructure takes care of keeping the force parked threads in place even across cpu hotplug. Aside of that the code implements the park/unpark facility of smp hotplug threads on its own, which is even more pointless. We have functionality in the smpboot thread code to do so. Use the new thread management functions and get rid of the unholy mess. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.470370113@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 190 ++++++------------------------------------------------ 1 file changed, 19 insertions(+), 171 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b35518375fb7..762d3ed82a08 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -91,13 +91,6 @@ int __read_mostly watchdog_thresh = 10; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* - * The 'watchdog_running' variable is set to 1 when the watchdog threads - * are registered/started and is set to 0 when the watchdog threads are - * unregistered/stopped, so it is an indicator whether the threads exist. - */ -static int __read_mostly watchdog_running; - /* * These functions can be overridden if an architecture implements its * own hardlockup detector. @@ -130,10 +123,6 @@ void __weak watchdog_nmi_reconfigure(void) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -259,11 +248,15 @@ void touch_all_softlockup_watchdogs(void) int cpu; /* - * this is done lockless - * do we care if a 0 races with a timestamp? - * all it means is the softlock check starts one cycle later + * watchdog_mutex cannpt be taken here, as this might be called + * from (soft)interrupt context, so the access to + * watchdog_allowed_cpumask might race with a concurrent update. + * + * The watchdog time stamp can race against a concurrent real + * update as well, the only side effect might be a cycle delay for + * the softlockup check. */ - for_each_watchdog_cpu(cpu) + for_each_cpu(cpu, &watchdog_allowed_mask) per_cpu(watchdog_touch_ts, cpu) = 0; wq_watchdog_touch(-1); } @@ -303,9 +296,6 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_enable_all_cpus(void); -static void watchdog_disable_all_cpus(void); - /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -498,95 +488,6 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -/* - * park all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function returns an error if kthread_park() of a watchdog thread - * fails. In this situation, the watchdog threads of some CPUs can already - * be parked and the watchdog threads of other CPUs can still be runnable. - * Callers are expected to handle this special condition as appropriate in - * their context. - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static int watchdog_park_threads(void) -{ - int cpu, ret = 0; - - for_each_watchdog_cpu(cpu) { - ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); - if (ret) - break; - } - return ret; -} - -/* - * unpark all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static void watchdog_unpark_threads(void) -{ - int cpu; - - for_each_watchdog_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); -} - -static int update_watchdog_all_cpus(void) -{ - int ret; - - ret = watchdog_park_threads(); - if (ret) - return ret; - - watchdog_unpark_threads(); - - return 0; -} - -static int watchdog_enable_all_cpus(void) -{ - int err = 0; - - if (!watchdog_running) { - err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_cpumask); - if (err) - pr_err("Failed to create watchdog threads, disabled\n"); - else - watchdog_running = 1; - } else { - /* - * Enable/disable the lockup detectors or - * change the sample period 'on the fly'. - */ - err = update_watchdog_all_cpus(); - - if (err) { - watchdog_disable_all_cpus(); - pr_err("Failed to update lockup detectors, disabled\n"); - } - } - - if (err) - watchdog_enabled = 0; - - return err; -} - -static void watchdog_disable_all_cpus(void) -{ - if (watchdog_running) { - watchdog_running = 0; - smpboot_unregister_percpu_thread(&watchdog_threads); - } -} - static void softlockup_update_smpboot_threads(void) { lockdep_assert_held(&watchdog_mutex); @@ -661,7 +562,6 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static inline void set_sample_period(void) { } static inline void softlockup_init_threads(void) { } static inline void softlockup_update_threads(void) { } static inline void softlockup_reconfigure_threads(bool enabled) { } @@ -701,28 +601,10 @@ void lockup_detector_soft_poweroff(void) /* * Update the run state of the lockup detectors. */ -static int proc_watchdog_update(void) +static void proc_watchdog_update(void) { - int err = 0; - - /* - * Watchdog threads won't be started if they are already active. - * The 'watchdog_running' variable in watchdog_*_all_cpus() takes - * care of this. If those threads are already active, the sample - * period will be updated and the lockup detectors will be enabled - * or disabled 'on the fly'. - */ - if (watchdog_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - + softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); watchdog_nmi_reconfigure(); - - __lockup_detector_cleanup(); - - return err; - } /* @@ -778,17 +660,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, new = old & ~which; } while (cmpxchg(&watchdog_enabled, old, new) != old); - /* - * Update the run state of the lockup detectors. There is _no_ - * need to check the value returned by proc_watchdog_update() - * and to restore the previous value of 'watchdog_enabled' as - * both lockup detectors are disabled if proc_watchdog_update() - * returns an error. - */ - if (old == new) - goto out; - - err = proc_watchdog_update(); + if (old != new) + proc_watchdog_update(); } out: mutex_unlock(&watchdog_mutex); @@ -832,50 +705,28 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; + int err, old; cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); - old = ACCESS_ONCE(watchdog_thresh); + old = READ_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; + if (!err && write && old != READ_ONCE(watchdog_thresh)) + proc_watchdog_update(); - /* - * Update the sample period. Restore on failure. - */ - new = ACCESS_ONCE(watchdog_thresh); - if (old == new) - goto out; - - set_sample_period(); - err = proc_watchdog_update(); - if (err) { - watchdog_thresh = old; - set_sample_period(); - } -out: mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } -static void watchdog_update_cpus(void) -{ - if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) { - smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_cpumask); - __lockup_detector_cleanup(); - } -} - static void proc_watchdog_cpumask_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - watchdog_update_cpus(); + + softlockup_update_threads(); watchdog_nmi_reconfigure(); } @@ -905,8 +756,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, void __init lockup_detector_init(void) { - set_sample_period(); - #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { pr_info("Disabling watchdog on nohz_full cores by default\n"); @@ -917,6 +766,5 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif - if (watchdog_enabled) - watchdog_enable_all_cpus(); + softlockup_init_threads(); } -- cgit From e8b62b2dd14f8f2427856ba24cb7db922bda9bfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:12 +0200 Subject: watchdog/core: Further simplify sysctl handling Use a single function to update sysctl changes. This is not a high frequency user space interface and it's root only. Preparatory patch to cleanup the sysctl variable handling. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.549114957@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 762d3ed82a08..ca8747221e87 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -507,11 +507,8 @@ static void softlockup_park_all_threads(void) softlockup_update_smpboot_threads(); } -/* - * Park threads which are not longer enabled and unpark threads which have - * been newly enabled. - */ -static void softlockup_update_threads(void) +/* Unpark enabled threads */ +static void softlockup_unpark_threads(void) { cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); softlockup_update_smpboot_threads(); @@ -522,7 +519,7 @@ static void softlockup_reconfigure_threads(bool enabled) softlockup_park_all_threads(); set_sample_period(); if (enabled) - softlockup_update_threads(); + softlockup_unpark_threads(); } /* @@ -563,7 +560,6 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static inline void softlockup_update_threads(void) { } static inline void softlockup_reconfigure_threads(bool enabled) { } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -598,11 +594,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL -/* - * Update the run state of the lockup detectors. - */ +/* Propagate any changes to the watchdog threads */ static void proc_watchdog_update(void) { + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); watchdog_nmi_reconfigure(); } @@ -721,15 +717,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, return err; } -static void proc_watchdog_cpumask_update(void) -{ - /* Remove impossible cpus to keep sysctl output clean. */ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - - softlockup_update_threads(); - watchdog_nmi_reconfigure(); -} - /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the @@ -746,7 +733,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_cpumask_update(); + proc_watchdog_update(); mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); -- cgit From 3b371b5936e7777c819619c00ca60f196a8e13fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:13 +0200 Subject: watchdog/core: Clean up header mess Having the same #ifdef in various places does not make it more readable. Collect stuff into one place. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.627096864@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 60 ++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 91a3a4a4c8ae..cfebb3bc4eed 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -14,11 +14,29 @@ void lockup_detector_init(void); void lockup_detector_soft_poweroff(void); void lockup_detector_cleanup(void); +bool is_hardlockup(void); + +extern int watchdog_user_enabled; +extern int nmi_watchdog_enabled; +extern int soft_watchdog_enabled; +extern int watchdog_thresh; +extern unsigned long watchdog_enabled; + +extern struct cpumask watchdog_cpumask; +extern unsigned long *watchdog_cpumask_bits; +#ifdef CONFIG_SMP +extern int sysctl_softlockup_all_cpu_backtrace; +extern int sysctl_hardlockup_all_cpu_backtrace; #else +#define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 +#endif /* !CONFIG_SMP */ + +#else /* CONFIG_LOCKUP_DETECTOR */ static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } static inline void lockup_detector_cleanup(void) { } -#endif +#endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); @@ -26,28 +44,17 @@ extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern int soft_watchdog_enabled; #else -static inline void touch_softlockup_watchdog_sched(void) -{ -} -static inline void touch_softlockup_watchdog(void) -{ -} -static inline void touch_softlockup_watchdog_sync(void) -{ -} -static inline void touch_all_softlockup_watchdogs(void) -{ -} +static inline void touch_softlockup_watchdog_sched(void) { } +static inline void touch_softlockup_watchdog(void) { } +static inline void touch_softlockup_watchdog_sync(void) { } +static inline void touch_all_softlockup_watchdogs(void) { } #endif #ifdef CONFIG_DETECT_HUNG_TASK void reset_hung_task_detector(void); #else -static inline void reset_hung_task_detector(void) -{ -} +static inline void reset_hung_task_detector(void) { } #endif /* @@ -92,7 +99,7 @@ static inline void arch_touch_nmi_watchdog(void) {} /** * touch_nmi_watchdog - restart NMI watchdog timeout. - * + * * If the architecture supports the NMI watchdog, touch_nmi_watchdog() * may be used to reset the timeout - for code which intentionally * disables interrupts for a long time. This call is stateless. @@ -162,21 +169,6 @@ static inline bool trigger_single_cpu_backtrace(int cpu) u64 hw_nmi_get_sample_period(int watchdog_thresh); #endif -#ifdef CONFIG_LOCKUP_DETECTOR -extern int nmi_watchdog_enabled; -extern int watchdog_user_enabled; -extern int watchdog_thresh; -extern unsigned long watchdog_enabled; -extern struct cpumask watchdog_cpumask; -extern unsigned long *watchdog_cpumask_bits; -#ifdef CONFIG_SMP -extern int sysctl_softlockup_all_cpu_backtrace; -extern int sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 -#endif - #if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ defined(CONFIG_HARDLOCKUP_DETECTOR) void watchdog_update_hrtimer_threshold(u64 period); @@ -184,7 +176,6 @@ void watchdog_update_hrtimer_threshold(u64 period); static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif -extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); @@ -196,7 +187,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); -#endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include -- cgit From 51d4052b01ca555e0d1d5fe297b309beb6c64aa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:14 +0200 Subject: watchdog/sysctl: Get rid of the #ifdeffery The sysctl of the nmi_watchdog file prevents writes by setting: min = max = 0 if none of the users is enabled. That involves ifdeffery and is competely non obvious. If none of the facilities is enabeld, then the file can simply be made read only. Move the ifdeffery into the header and use a constant for file permissions. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.706073616@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 6 ++++++ kernel/sysctl.c | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index cfebb3bc4eed..5774b443dba1 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -81,6 +81,12 @@ extern unsigned int hardlockup_panic; static inline void hardlockup_detector_disable(void) {} #endif +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +# define NMI_WATCHDOG_SYSCTL_PERM 0644 +#else +# define NMI_WATCHDOG_SYSCTL_PERM 0444 +#endif + #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); extern void hardlockup_detector_perf_stop(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..539cb4e97bb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -891,14 +891,10 @@ static struct ctl_table kern_table[] = { .procname = "nmi_watchdog", .data = &nmi_watchdog_enabled, .maxlen = sizeof (int), - .mode = 0644, + .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) .extra2 = &one, -#else - .extra2 = &zero, -#endif }, { .procname = "watchdog_cpumask", -- cgit From 7feeb9cd4f5b34476ffb9e6d58d58c5416375b19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:15 +0200 Subject: watchdog/sysctl: Clean up sysctl variable name space Reflect that these variables are user interface related and remove the whitespace damage in the sysctl table while at it. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.783210221@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 16 ++++++++-------- kernel/sysctl.c | 16 ++++++++-------- kernel/watchdog.c | 41 ++++++++++++++++++++--------------------- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5774b443dba1..4a8d1037364e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,8 +17,8 @@ void lockup_detector_cleanup(void); bool is_hardlockup(void); extern int watchdog_user_enabled; -extern int nmi_watchdog_enabled; -extern int soft_watchdog_enabled; +extern int nmi_watchdog_user_enabled; +extern int soft_watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; @@ -62,12 +62,12 @@ static inline void reset_hung_task_detector(void) { } * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. + * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and + * 'soft_watchdog_user_enabled' are variables that are only used as an + * 'interface' between the parameters in /proc/sys/kernel and the internal + * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is + * handled differently because its value is not boolean, and the lockup + * detectors are 'suspended' while 'watchdog_thresh' is equal zero. */ #define NMI_WATCHDOG_ENABLED_BIT 0 #define SOFT_WATCHDOG_ENABLED_BIT 1 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 539cb4e97bb8..4c08ed4a379e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -871,9 +871,9 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, @@ -889,8 +889,8 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, - .maxlen = sizeof (int), + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, @@ -906,9 +906,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", - .data = &soft_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = &zero, .extra2 = &one, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ca8747221e87..baae9fc95031 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,8 +31,6 @@ static DEFINE_MUTEX(watchdog_mutex); -int __read_mostly nmi_watchdog_enabled; - #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED; @@ -40,6 +38,17 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif +int __read_mostly nmi_watchdog_user_enabled; +int __read_mostly soft_watchdog_user_enabled; +int __read_mostly watchdog_user_enabled; +int __read_mostly watchdog_thresh = 10; + +struct cpumask watchdog_allowed_mask __read_mostly; +static bool softlockup_threads_initialized __read_mostly; + +struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + #ifdef CONFIG_HARDLOCKUP_DETECTOR /* * Should we panic when a soft-lockup or hard-lockup occurs: @@ -85,12 +94,6 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); # endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -int __read_mostly watchdog_user_enabled; -int __read_mostly watchdog_thresh = 10; - -struct cpumask watchdog_cpumask __read_mostly; -unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); - /* * These functions can be overridden if an architecture implements its * own hardlockup detector. @@ -113,7 +116,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * watchdog_nmi_reconfigure can be implemented to be notified after any * watchdog configuration change. The arch hardlockup watchdog should * respond to the following variables: - * - nmi_watchdog_enabled + * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask * - sysctl_hardlockup_all_cpu_backtrace @@ -126,10 +129,6 @@ void __weak watchdog_nmi_reconfigure(void) { } /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -int __read_mostly soft_watchdog_enabled; - -struct cpumask watchdog_allowed_mask __read_mostly; -static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -606,14 +605,14 @@ static void proc_watchdog_update(void) /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * - * caller | table->data points to | 'which' contains the flag(s) - * -------------------|-----------------------|----------------------------- - * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed - * | | with SOFT_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + * caller | table->data points to | 'which' + * -------------------|----------------------------|-------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | + * | | SOFT_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit From 6592ad2fcc8f15b4f99b36c1db7d9f65510c203b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:16 +0200 Subject: watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage Both the perf reconfiguration and the powerpc watchdog_nmi_reconfigure() need to be done in two steps. 1) Stop all NMIs 2) Read the new parameters and start NMIs Right now watchdog_nmi_reconfigure() is a combination of both. To allow a clean reconfiguration add a 'run' argument and split the functionality in powerpc. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170912194147.862865570@linutronix.de Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/watchdog.c | 17 +++++++++-------- include/linux/nmi.h | 2 ++ kernel/watchdog.c | 31 ++++++++++++++++++++++--------- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 5ded171f02d6..291af79a9826 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -355,17 +355,18 @@ static void watchdog_calc_timeouts(void) wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; } -void watchdog_nmi_reconfigure(void) +void watchdog_nmi_reconfigure(bool run) { int cpu; - watchdog_calc_timeouts(); - - for_each_cpu(cpu, &wd_cpus_enabled) - stop_wd_on_cpu(cpu); - - for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) - start_wd_on_cpu(cpu); + if (!run) { + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + } else { + watchdog_calc_timeouts(); + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); + } } /* diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 4a8d1037364e..eee255bc0fd6 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -103,6 +103,8 @@ static inline void arch_touch_nmi_watchdog(void) {} #endif #endif +void watchdog_nmi_reconfigure(bool run); + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * diff --git a/kernel/watchdog.c b/kernel/watchdog.c index baae9fc95031..5693afd2b8ea 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -112,17 +112,25 @@ void __weak watchdog_nmi_disable(unsigned int cpu) hardlockup_detector_perf_disable(); } -/* - * watchdog_nmi_reconfigure can be implemented to be notified after any - * watchdog configuration change. The arch hardlockup watchdog should - * respond to the following variables: +/** + * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs + * @run: If false stop the watchdogs on all enabled CPUs + * If true start the watchdogs on all enabled CPUs + * + * The core call order is: + * watchdog_nmi_reconfigure(false); + * update_variables(); + * watchdog_nmi_reconfigure(true); + * + * The second call which starts the watchdogs again guarantees that the + * following variables are stable across the call. * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - sysctl_hardlockup_all_cpu_backtrace - * - hardlockup_panic + * + * After the call the variables can be changed again. */ -void __weak watchdog_nmi_reconfigure(void) { } +void __weak watchdog_nmi_reconfigure(bool run) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -515,10 +523,12 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(bool enabled) { + watchdog_nmi_reconfigure(false); softlockup_park_all_threads(); set_sample_period(); if (enabled) softlockup_unpark_threads(); + watchdog_nmi_reconfigure(true); } /* @@ -559,7 +569,11 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static inline void softlockup_reconfigure_threads(bool enabled) { } +static void softlockup_reconfigure_threads(bool enabled) +{ + watchdog_nmi_reconfigure(false); + watchdog_nmi_reconfigure(true); +} #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) @@ -599,7 +613,6 @@ static void proc_watchdog_update(void) /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); - watchdog_nmi_reconfigure(); } /* -- cgit From 091549858ed881e5f3054374af4f5b1cac681d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:17 +0200 Subject: watchdog/core: Get rid of the racy update loop Letting user space poke directly at variables which are used at run time is stupid and causes a lot of race conditions and other issues. Seperate the user variables and on change invoke the reconfiguration, which then stops the watchdogs, reevaluates the new user value and restarts the watchdogs with the new parameters. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.939985640@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 95 +++++++++++++++++++++++++++---------------------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5693afd2b8ea..84886319d7b0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -32,15 +32,17 @@ static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | - NMI_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 1 #else -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 0 #endif -int __read_mostly nmi_watchdog_user_enabled; -int __read_mostly soft_watchdog_user_enabled; -int __read_mostly watchdog_user_enabled; +unsigned long __read_mostly watchdog_enabled; +int __read_mostly watchdog_user_enabled = 1; +int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; +int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; struct cpumask watchdog_allowed_mask __read_mostly; @@ -65,7 +67,7 @@ unsigned int __read_mostly hardlockup_panic = */ void __init hardlockup_detector_disable(void) { - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) @@ -75,9 +77,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -132,6 +134,23 @@ void __weak watchdog_nmi_disable(unsigned int cpu) */ void __weak watchdog_nmi_reconfigure(bool run) { } +/** + * lockup_detector_update_enable - Update the sysctl enable bit + * + * Caller needs to make sure that the NMI/perf watchdogs are off, so this + * can't race with watchdog_nmi_disable(). + */ +static void lockup_detector_update_enable(void) +{ + watchdog_enabled = 0; + if (!watchdog_user_enabled) + return; + if (nmi_watchdog_user_enabled) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + if (soft_watchdog_user_enabled) + watchdog_enabled |= SOFT_WATCHDOG_ENABLED; +} + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Global variables, exported for sysctl */ @@ -160,14 +179,14 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { - watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; + soft_watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -521,12 +540,13 @@ static void softlockup_unpark_threads(void) softlockup_update_smpboot_threads(); } -static void softlockup_reconfigure_threads(bool enabled) +static void softlockup_reconfigure_threads(void) { watchdog_nmi_reconfigure(false); softlockup_park_all_threads(); set_sample_period(); - if (enabled) + lockup_detector_update_enable(); + if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); watchdog_nmi_reconfigure(true); } @@ -546,6 +566,8 @@ static __init void softlockup_init_threads(void) * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. */ + lockup_detector_update_enable(); + if (!IS_ENABLED(CONFIG_SYSCTL) && !(watchdog_enabled && watchdog_thresh)) return; @@ -559,7 +581,7 @@ static __init void softlockup_init_threads(void) mutex_lock(&watchdog_mutex); softlockup_threads_initialized = true; - softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + softlockup_reconfigure_threads(); mutex_unlock(&watchdog_mutex); } @@ -569,9 +591,10 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static void softlockup_reconfigure_threads(bool enabled) +static void softlockup_reconfigure_threads(void) { watchdog_nmi_reconfigure(false); + lockup_detector_update_enable(); watchdog_nmi_reconfigure(true); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -612,7 +635,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + softlockup_reconfigure_threads(); } /* @@ -630,48 +653,24 @@ static void proc_watchdog_update(void) static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; - int *watchdog_param = (int *)table->data; + int err, old, *param = table->data; cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); - /* - * If the parameter is being read return the state of the corresponding - * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the - * run state of the lockup detectors. - */ if (!write) { - *watchdog_param = (watchdog_enabled & which) != 0; + /* + * On read synchronize the userspace interface. This is a + * racy snapshot. + */ + *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } else { + old = READ_ONCE(*param); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err) - goto out; - - /* - * There is a race window between fetching the current value - * from 'watchdog_enabled' and storing the new value. During - * this race window, watchdog_nmi_enable() can sneak in and - * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. - * The 'cmpxchg' detects this race and the loop retries. - */ - do { - old = watchdog_enabled; - /* - * If the parameter value is not zero set the - * corresponding bit(s), else clear it(them). - */ - if (*watchdog_param) - new = old | which; - else - new = old & ~which; - } while (cmpxchg(&watchdog_enabled, old, new) != old); - - if (old != new) + if (!err && old != READ_ONCE(*param)) proc_watchdog_update(); } -out: mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; -- cgit From 178b9f7a36d2c74a38274b66dd89f53611298a19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:18 +0200 Subject: watchdog/hardlockup/perf: Implement init time perf validation The watchdog tries to create perf events even after it figured out that perf is not functional or the requested event is not supported. That's braindead as this can be done once at init time and if not supported the NMI watchdog can be turned off unconditonally. Implement the perf hardlockup detector functionality for that. This creates a new event create function, which will replace the unholy mess of the existing one in later patches. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.019090547@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 8 ++++++-- kernel/watchdog_hld.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index eee255bc0fd6..72c62a809e92 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -93,14 +93,18 @@ extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_detector_perf_disable(void); extern void hardlockup_detector_perf_cleanup(void); +extern int hardlockup_detector_perf_init(void); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_detector_perf_disable(void) { } static inline void hardlockup_detector_perf_cleanup(void) { } -#if !defined(CONFIG_HAVE_NMI_WATCHDOG) +# if !defined(CONFIG_HAVE_NMI_WATCHDOG) +static inline int hardlockup_detector_perf_init(void) { return -ENODEV; } static inline void arch_touch_nmi_watchdog(void) {} -#endif +# else +static inline int hardlockup_detector_perf_init(void) { return 0; } +# endif #endif void watchdog_nmi_reconfigure(bool run); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 0aa191ee3d51..f7e752e6e9b4 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -238,6 +238,27 @@ out: return 0; } +static int hardlockup_detector_event_create(void) +{ + unsigned int cpu = smp_processor_id(); + struct perf_event_attr *wd_attr; + struct perf_event *evt; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + pr_info("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); + return PTR_ERR(evt); + } + this_cpu_write(watchdog_ev, evt); + return 0; +} + /** * hardlockup_detector_perf_disable - Disable the local event */ @@ -315,3 +336,19 @@ void __init hardlockup_detector_perf_restart(void) perf_event_enable(event); } } + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ + int ret = hardlockup_detector_event_create(); + + if (ret) { + pr_info("Perf NMI watchdog permanetely disabled\n"); + } else { + perf_event_release_kernel(this_cpu_read(watchdog_ev)); + this_cpu_write(watchdog_ev, NULL); + } + return ret; +} -- cgit From a994a3147e4c0c9c50a46e6cace7586254975e20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:19 +0200 Subject: watchdog/hardlockup/perf: Implement init time detection of perf Use the init time detection of the perf NMI watchdog to determine whether the perf NMI watchdog is functional. If not disable it permanentely. It won't come back magically at runtime. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.099799541@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84886319d7b0..fd8a998eb197 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,6 +44,7 @@ int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +int __read_mostly nmi_watchdog_available; struct cpumask watchdog_allowed_mask __read_mostly; static bool softlockup_threads_initialized __read_mostly; @@ -114,6 +115,12 @@ void __weak watchdog_nmi_disable(unsigned int cpu) hardlockup_detector_perf_disable(); } +/* Return 0, if a NMI watchdog is available. Error code otherwise */ +int __weak __init watchdog_nmi_probe(void) +{ + return hardlockup_detector_perf_init(); +} + /** * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs * @run: If false stop the watchdogs on all enabled CPUs @@ -145,7 +152,7 @@ static void lockup_detector_update_enable(void) watchdog_enabled = 0; if (!watchdog_user_enabled) return; - if (nmi_watchdog_user_enabled) + if (nmi_watchdog_available && nmi_watchdog_user_enabled) watchdog_enabled |= NMI_WATCHDOG_ENABLED; if (soft_watchdog_user_enabled) watchdog_enabled |= SOFT_WATCHDOG_ENABLED; @@ -692,6 +699,8 @@ int proc_watchdog(struct ctl_table *table, int write, int proc_nmi_watchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (!nmi_watchdog_available && write) + return -ENOTSUPP; return proc_watchdog_common(NMI_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); } @@ -764,5 +773,7 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif + if (!watchdog_nmi_probe()) + nmi_watchdog_available = true; softlockup_init_threads(); } -- cgit From 2a1b8ee4f5665b4291e43e4a25d964c3eb2f4c32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:20 +0200 Subject: watchdog/hardlockup/perf: Implement CPU enable replacement watchdog_nmi_enable() is an unparseable mess, Provide a clean perf specific implementation, which will be used when the existing setup/teardown mess is replaced. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.180215498@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 2 ++ kernel/watchdog_hld.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 72c62a809e92..89ba8b23c6fe 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -92,12 +92,14 @@ extern void arch_touch_nmi_watchdog(void); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_detector_perf_disable(void); +extern void hardlockup_detector_perf_enable(void); extern void hardlockup_detector_perf_cleanup(void); extern int hardlockup_detector_perf_init(void); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_detector_perf_disable(void) { } +static inline void hardlockup_detector_perf_enable(void) { } static inline void hardlockup_detector_perf_cleanup(void) { } # if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline int hardlockup_detector_perf_init(void) { return -ENODEV; } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index f7e752e6e9b4..99a3f22e48cc 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -259,6 +259,17 @@ static int hardlockup_detector_event_create(void) return 0; } +/** + * hardlockup_detector_perf_enable - Enable the local event + */ +void hardlockup_detector_perf_enable(void) +{ + if (hardlockup_detector_event_create()) + return; + + perf_event_enable(this_cpu_read(watchdog_ev)); +} + /** * hardlockup_detector_perf_disable - Disable the local event */ -- cgit From 146c9d0e9dfdb62ed6afd43cc263efafbbfd1dcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:21 +0200 Subject: watchdog/hardlockup/perf: Use new perf CPU enable mechanism Get rid of the hodgepodge which tries to be smart about perf being unavailable and error printout rate limiting. That's all not required simply because this is never invoked when the perf NMI watchdog is not functional. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.259651788@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 4 ++- kernel/watchdog_hld.c | 88 +++------------------------------------------------ 2 files changed, 8 insertions(+), 84 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fd8a998eb197..5eb11960e4a2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -107,6 +107,7 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); */ int __weak watchdog_nmi_enable(unsigned int cpu) { + hardlockup_detector_perf_enable(); return 0; } @@ -465,7 +466,8 @@ static void watchdog_enable(unsigned int cpu) /* Initialize timestamp */ __touch_watchdog(); /* Enable the perf event */ - watchdog_nmi_enable(cpu); + if (watchdog_enabled & NMI_WATCHDOG_ENABLED) + watchdog_nmi_enable(cpu); watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 99a3f22e48cc..509bb6b59c41 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -25,7 +25,7 @@ static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; -static bool hardlockup_detector_disabled; +static unsigned int watchdog_cpus; void arch_touch_nmi_watchdog(void) { @@ -160,84 +160,6 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long firstcpu_err; -static atomic_t watchdog_cpus; - -int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - int firstcpu = 0; - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* A failure disabled the hardlockup detector permanently */ - if (hardlockup_detector_disabled) - return -ENODEV; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - if (atomic_inc_return(&watchdog_cpus) == 1) - firstcpu = 1; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save the first cpu's error for future comparision */ - if (firstcpu && IS_ERR(event)) - firstcpu_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for the first cpu initialized */ - if (firstcpu || firstcpu_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* skip displaying the same error again */ - if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Disabling hard lockup detector permanently\n"); - hardlockup_detector_disabled = true; - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - static int hardlockup_detector_event_create(void) { unsigned int cpu = smp_processor_id(); @@ -267,6 +189,9 @@ void hardlockup_detector_perf_enable(void) if (hardlockup_detector_event_create()) return; + if (!watchdog_cpus++) + pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); + perf_event_enable(this_cpu_read(watchdog_ev)); } @@ -282,10 +207,7 @@ void hardlockup_detector_perf_disable(void) this_cpu_write(watchdog_ev, NULL); this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); - - /* watchdog_nmi_enable() expects this to be zero initially. */ - if (atomic_dec_and_test(&watchdog_cpus)) - firstcpu_err = 0; + watchdog_cpus--; } } -- cgit From a33d44843d4574ec05bec39527d8a87b7af2072c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:22 +0200 Subject: watchdog/hardlockup/perf: Simplify deferred event destroy Now that all functionality is properly serialized against CPU hotplug, remove the extra per cpu storage which holds the disabled events for cleanup. The core makes sure that cleanup happens before new events are created. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.340708074@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog_hld.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 509bb6b59c41..b2931154b5f2 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,7 +21,6 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; @@ -204,8 +203,6 @@ void hardlockup_detector_perf_disable(void) if (event) { perf_event_disable(event); - this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); watchdog_cpus--; } @@ -221,9 +218,9 @@ void hardlockup_detector_perf_cleanup(void) int cpu; for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); + struct perf_event *event = per_cpu(watchdog_ev, cpu); - per_cpu(dead_event, cpu) = NULL; + per_cpu(watchdog_ev, cpu) = NULL; perf_event_release_kernel(event); } cpumask_clear(&dead_events_mask); -- cgit From ab5fe3ff38ff9653490910cc71dbbedc95a86e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:23 +0200 Subject: watchdog/hardlockup: Clean up hotplug locking mess All watchdog thread related functions are delegated to the smpboot thread infrastructure, which handles serialization against CPU hotplug correctly. The sysctl interface is completely decoupled from anything which requires CPU hotplug protection. No need to protect the sysctl writes against cpu hotplug anymore. Remove it and add the now required protection to the powerpc arch_nmi_watchdog implementation. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170912194148.418497420@linutronix.de Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/watchdog.c | 2 ++ kernel/watchdog.c | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 291af79a9826..dfb067764480 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -359,6 +359,7 @@ void watchdog_nmi_reconfigure(bool run) { int cpu; + cpus_read_lock(); if (!run) { for_each_cpu(cpu, &wd_cpus_enabled) stop_wd_on_cpu(cpu); @@ -367,6 +368,7 @@ void watchdog_nmi_reconfigure(bool run) for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) start_wd_on_cpu(cpu); } + cpus_read_unlock(); } /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5eb11960e4a2..f6ef163b72cd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -664,7 +664,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, { int err, old, *param = table->data; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); if (!write) { @@ -681,7 +680,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, proc_watchdog_update(); } mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } @@ -725,7 +723,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); old = READ_ONCE(watchdog_thresh); @@ -735,7 +732,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, proc_watchdog_update(); mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } @@ -750,7 +746,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); @@ -758,7 +753,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, proc_watchdog_update(); mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } #endif /* CONFIG_SYSCTL */ -- cgit From 4cf97582b46f123a4b7cd88d999f1806c2eb4093 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 11 Sep 2017 17:43:56 +0200 Subject: drm/amdgpu: revert tile table update for oland MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several users have complained that the tile table update broke Oland support. Despite several attempts to fix it, the root cause is still unknown at this point and no solution is available. As it is not acceptable to leave a known regression breaking a major functionality in the kernel for several releases, let's just reverse this optimization for now. It can be implemented again later if and only if the breakage is understood and fixed. As there were no complaints for Hainan so far, only the Oland part of the offending commit is reverted. Optimization is preserved on Hainan, so this commit isn't an actual revert of the original. This fixes bug #194761: https://bugzilla.kernel.org/show_bug.cgi?id=194761 Reviewed-by: Marek Olšák Signed-off-by: Jean Delvare Fixes: f8d9422ef80c ("drm/amdgpu: update tile table for oland/hainan") Cc: Flora Cui Cc: Junwei Zhang Cc: Alex Deucher Cc: Marek Olšák Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 189 +++++++++++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index d228f5a99044..dbbe986f90f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -636,7 +636,194 @@ static void gfx_v6_0_tiling_mode_table_init(struct amdgpu_device *adev) NUM_BANKS(ADDR_SURF_2_BANK); for (reg_offset = 0; reg_offset < num_tile_mode_states; reg_offset++) WREG32(mmGB_TILE_MODE0 + reg_offset, tilemode[reg_offset]); - } else if (adev->asic_type == CHIP_OLAND || adev->asic_type == CHIP_HAINAN) { + } else if (adev->asic_type == CHIP_OLAND) { + tilemode[0] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[1] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_128B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[2] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[3] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_128B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[4] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[5] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[6] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[7] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[8] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_LINEAR_ALIGNED) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[9] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[10] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[11] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[12] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[13] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[14] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[15] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[16] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[17] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[21] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_2) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[22] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[23] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[24] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[25] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_1KB) | + NUM_BANKS(ADDR_SURF_8_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_1); + for (reg_offset = 0; reg_offset < num_tile_mode_states; reg_offset++) + WREG32(mmGB_TILE_MODE0 + reg_offset, tilemode[reg_offset]); + } else if (adev->asic_type == CHIP_HAINAN) { tilemode[0] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | ARRAY_MODE(ARRAY_2D_TILED_THIN1) | PIPE_CONFIG(ADDR_SURF_P2) | -- cgit From 820608548737e315c6f93e3099b4e65bde062334 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 15 Sep 2017 11:55:27 -0400 Subject: drm/radeon: disable hard reset in hibernate for APUs Fixes a hibernation regression on APUs. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=191571 Fixes: 274ad65c9d02bdc (drm/radeon: hard reset r600 and newer GPU when hibernating.) Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 997131d58c7f..ffc10cadcf34 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1663,7 +1663,7 @@ int radeon_suspend_kms(struct drm_device *dev, bool suspend, radeon_agp_suspend(rdev); pci_save_state(dev->pdev); - if (freeze && rdev->family >= CHIP_CEDAR) { + if (freeze && rdev->family >= CHIP_CEDAR && !(rdev->flags & RADEON_IS_IGP)) { rdev->asic->asic_reset(rdev, true); pci_restore_state(dev->pdev); } else if (suspend) { -- cgit From 6c92f7dbf25c36f35320e4ae0b508676410bac04 Mon Sep 17 00:00:00 2001 From: Dave Carroll Date: Fri, 15 Sep 2017 11:04:28 -0600 Subject: scsi: aacraid: Fix 2T+ drives on SmartIOC-2000 The logic for supporting large drives was previously tied to 4Kn support for SmartIOC-2000. As SmartIOC-2000 does not support volumes using 4Kn drives, use the intended option flag AAC_OPT_NEW_COMM_64 to determine support for volumes greater than 2T. Cc: Signed-off-by: Dave Carroll Reviewed-by: Christoph Hellwig Reviewed-by: Raghava Aditya Renukunta Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/aachba.c | 12 ++++++------ drivers/scsi/aacraid/aacraid.h | 5 +++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index a64285ab0728..af3e4d3f9735 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -699,13 +699,13 @@ static void _aac_probe_container1(void * context, struct fib * fibptr) int status; dresp = (struct aac_mount *) fib_data(fibptr); - if (!(fibptr->dev->supplement_adapter_info.supported_options2 & - AAC_OPTION_VARIABLE_BLOCK_SIZE)) + if (!aac_supports_2T(fibptr->dev)) { dresp->mnt[0].capacityhigh = 0; - if ((le32_to_cpu(dresp->status) != ST_OK) || - (le32_to_cpu(dresp->mnt[0].vol) != CT_NONE)) { - _aac_probe_container2(context, fibptr); - return; + if ((le32_to_cpu(dresp->status) == ST_OK) && + (le32_to_cpu(dresp->mnt[0].vol) != CT_NONE)) { + _aac_probe_container2(context, fibptr); + return; + } } scsicmd = (struct scsi_cmnd *) context; diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h index 92fabf2b0c24..403a639574e5 100644 --- a/drivers/scsi/aacraid/aacraid.h +++ b/drivers/scsi/aacraid/aacraid.h @@ -2701,6 +2701,11 @@ static inline int aac_is_src(struct aac_dev *dev) return 0; } +static inline int aac_supports_2T(struct aac_dev *dev) +{ + return (dev->adapter_info.options & AAC_OPT_NEW_COMM_64); +} + char * get_container_type(unsigned type); extern int numacb; extern char aac_driver_version[]; -- cgit From 5c756065e47dc3e84b00577bd109f0a8e69903d7 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 6 Sep 2017 11:02:56 +0200 Subject: scsi: lpfc: Don't return internal MBXERR_ERROR code from probe function Internal error codes happen to be positive, thus the PCI driver core won't treat them as failure, but we do. This would cause a crash later on as lpfc_pci_remove_one() is called (e.g. as shutdown function). Fixes: 6d368e532168 ("[SCSI] lpfc 8.3.24: Add resource extent support") Signed-off-by: Stefano Brivio Reviewed-by: Johannes Thumshirn Acked-by: Dick Kennedy Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 7e7ae786121b..100bc4c8798d 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -6131,6 +6131,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) "Extents and RPI headers enabled.\n"); } mempool_free(mboxq, phba->mbox_mem_pool); + rc = -EIO; goto out_free_bsmbx; } -- cgit From 4cb433e856bce5974ea035181cc8eb406496dccc Mon Sep 17 00:00:00 2001 From: Nikola Pajkovsky Date: Wed, 13 Sep 2017 10:46:17 +0200 Subject: scsi: aacraid: error: testing array offset 'bus' after use Fix possible indexing array of bound for &aac->hba_map[bus][cid], where bus and cid boundary check happens later. Fixes: 0d643ff3c353 ("scsi: aacraid: use aac_tmf_callback for reset fib") Signed-off-by: Nikola Pajkovsky Reviewed-by: Dave Carroll Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/linit.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 87cc4a93e637..62beb2596466 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -906,12 +906,14 @@ static int aac_eh_dev_reset(struct scsi_cmnd *cmd) bus = aac_logical_to_phys(scmd_channel(cmd)); cid = scmd_id(cmd); - info = &aac->hba_map[bus][cid]; - if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS || - info->devtype != AAC_DEVTYPE_NATIVE_RAW) + + if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS) return FAILED; - if (info->reset_state > 0) + info = &aac->hba_map[bus][cid]; + + if (info->devtype != AAC_DEVTYPE_NATIVE_RAW && + info->reset_state > 0) return FAILED; pr_err("%s: Host adapter reset request. SCSI hang ?\n", @@ -962,12 +964,14 @@ static int aac_eh_target_reset(struct scsi_cmnd *cmd) bus = aac_logical_to_phys(scmd_channel(cmd)); cid = scmd_id(cmd); - info = &aac->hba_map[bus][cid]; - if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS || - info->devtype != AAC_DEVTYPE_NATIVE_RAW) + + if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS) return FAILED; - if (info->reset_state > 0) + info = &aac->hba_map[bus][cid]; + + if (info->devtype != AAC_DEVTYPE_NATIVE_RAW && + info->reset_state > 0) return FAILED; pr_err("%s: Host adapter reset request. SCSI hang ?\n", -- cgit From 6354a06cbaa8c49d8377a6cee3e7db399c23601c Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Wed, 13 Sep 2017 09:38:40 +0200 Subject: Revert "arm64: dts: rockchip: Add basic cpu frequencies for RK3368" This reverts commit 6f2dea1f5fdb73eb2e050d9ebe990121d557e519. Without accurate cpu regulators being set for boards this will wreak havoc when cpufreq-dt begins to set new frequencies without adjusting the core frequency. Additionally the rk3368 has an unsolved issue in that it has two separate cpu clusters with separate clock lines but only one cpu supply regulator for both clusters, which causes even more problems. While it seems that originally only one cluster was supposed to be active at a time (big or little), talking with real users of the hardware revealed that having all 8 cores accessible at 1.2GHz max is way more liked than having 4 cores at 1.5GHz max. Such an approach needs changes to cpufreq and/or opp though to control the two separate clock lines when setting both clusters to the same frequencies. In any case, having the OPPs in the dts at this point in time is undesireable, so remove them again for now. Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3368.dtsi | 72 +------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index e0518b4bc6c2..19fbaa5e7bdd 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -113,8 +113,7 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x0>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; + #cooling-cells = <2>; /* min followed by max */ }; @@ -123,8 +122,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x1>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; }; cpu_l2: cpu@2 { @@ -132,8 +129,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x2>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; }; cpu_l3: cpu@3 { @@ -141,8 +136,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x3>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; }; cpu_b0: cpu@100 { @@ -150,8 +143,7 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x100>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; + #cooling-cells = <2>; /* min followed by max */ }; @@ -160,8 +152,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x101>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; }; cpu_b2: cpu@102 { @@ -169,8 +159,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x102>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; }; cpu_b3: cpu@103 { @@ -178,62 +166,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x103>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; - }; - }; - - cluster0_opp: opp-table0 { - compatible = "operating-points-v2"; - opp-shared; - - opp00 { - opp-hz = /bits/ 64 <312000000>; - opp-microvolt = <950000>; - clock-latency-ns = <40000>; - }; - opp01 { - opp-hz = /bits/ 64 <408000000>; - opp-microvolt = <950000>; - }; - opp02 { - opp-hz = /bits/ 64 <600000000>; - opp-microvolt = <950000>; - }; - opp03 { - opp-hz = /bits/ 64 <816000000>; - opp-microvolt = <1025000>; - }; - opp04 { - opp-hz = /bits/ 64 <1008000000>; - opp-microvolt = <1125000>; - }; - }; - - cluster1_opp: opp-table1 { - compatible = "operating-points-v2"; - opp-shared; - - opp00 { - opp-hz = /bits/ 64 <312000000>; - opp-microvolt = <950000>; - clock-latency-ns = <40000>; - }; - opp01 { - opp-hz = /bits/ 64 <408000000>; - opp-microvolt = <950000>; - }; - opp02 { - opp-hz = /bits/ 64 <600000000>; - opp-microvolt = <950000>; - }; - opp03 { - opp-hz = /bits/ 64 <816000000>; - opp-microvolt = <975000>; - }; - opp04 { - opp-hz = /bits/ 64 <1008000000>; - opp-microvolt = <1050000>; }; }; -- cgit From e8620acc90785ecbde041d241a13778044df9208 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Fri, 1 Sep 2017 10:01:44 +0800 Subject: clk: rockchip: add pclk_pmu as critical clock on rk3128 pclk_pmu need always on, and no dts node to handle this clk, so make it as critical clock Signed-off-by: Elaine Zhang Signed-off-by: Heiko Stuebner --- drivers/clk/rockchip/clk-rk3128.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clk/rockchip/clk-rk3128.c b/drivers/clk/rockchip/clk-rk3128.c index 62d7854e4b87..f15c9b874911 100644 --- a/drivers/clk/rockchip/clk-rk3128.c +++ b/drivers/clk/rockchip/clk-rk3128.c @@ -541,7 +541,7 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = { GATE(0, "pclk_grf", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 4, GFLAGS), GATE(0, "pclk_mipiphy", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 0, GFLAGS), - GATE(0, "pclk_pmu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 2, GFLAGS), + GATE(0, "pclk_pmu", "pclk_pmu_pre", 0, RK2928_CLKGATE_CON(9), 2, GFLAGS), GATE(0, "pclk_pmu_niu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 3, GFLAGS), /* PD_MMC */ @@ -577,6 +577,7 @@ static const char *const rk3128_critical_clocks[] __initconst = { "aclk_peri", "hclk_peri", "pclk_peri", + "pclk_pmu", }; static struct rockchip_clk_provider *__init rk3128_common_clk_init(struct device_node *np) -- cgit From a4eb286565eb07ee5acfe6a1409f68ad9f663845 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Fri, 1 Sep 2017 10:01:45 +0800 Subject: clk: rockchip: fix up rk3128 pvtm and mipi_24m gate regs error A copy-paste error made them use the wrong bits in the register. Signed-off-by: Elaine Zhang Signed-off-by: Heiko Stuebner --- drivers/clk/rockchip/clk-rk3128.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/clk/rockchip/clk-rk3128.c b/drivers/clk/rockchip/clk-rk3128.c index f15c9b874911..ce02d2cff608 100644 --- a/drivers/clk/rockchip/clk-rk3128.c +++ b/drivers/clk/rockchip/clk-rk3128.c @@ -315,13 +315,13 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = { RK2928_CLKGATE_CON(10), 8, GFLAGS), GATE(SCLK_PVTM_CORE, "clk_pvtm_core", "xin24m", 0, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(10), 0, GFLAGS), GATE(SCLK_PVTM_GPU, "clk_pvtm_gpu", "xin24m", 0, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(10), 1, GFLAGS), GATE(SCLK_PVTM_FUNC, "clk_pvtm_func", "xin24m", 0, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(10), 2, GFLAGS), GATE(SCLK_MIPI_24M, "clk_mipi_24m", "xin24m", CLK_IGNORE_UNUSED, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(2), 15, GFLAGS), COMPOSITE(SCLK_SDMMC, "sclk_sdmmc0", mux_mmc_src_p, 0, RK2928_CLKSEL_CON(11), 6, 2, MFLAGS, 0, 6, DFLAGS, -- cgit From 00e6751ffc9e6e0651e514961316fd15f0409683 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Fri, 1 Sep 2017 10:01:46 +0800 Subject: clk: rockchip: add sclk_timer5 as critical clock on rk3128 sclk_timer5 is for arm arch counter, so need always on. but no dts node to handle this clk, so make it as critical clock Signed-off-by: Elaine Zhang Signed-off-by: Heiko Stuebner --- drivers/clk/rockchip/clk-rk3128.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/rockchip/clk-rk3128.c b/drivers/clk/rockchip/clk-rk3128.c index ce02d2cff608..5970a50671b9 100644 --- a/drivers/clk/rockchip/clk-rk3128.c +++ b/drivers/clk/rockchip/clk-rk3128.c @@ -578,6 +578,7 @@ static const char *const rk3128_critical_clocks[] __initconst = { "hclk_peri", "pclk_peri", "pclk_pmu", + "sclk_timer5", }; static struct rockchip_clk_provider *__init rk3128_common_clk_init(struct device_node *np) -- cgit From 27563cd9f8f52f09523e061985917c38f302bd0c Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 14 Sep 2017 17:28:12 +0300 Subject: ARM: dts: at91: sama5d27_som1_ek: update pinmux/pinconf for LEDs and USB There are some changes from the prototype board concerning LEDs and USB pins: - USBB power enable and red LED pins are inverted. - The polarity of LEDs is inverted too. Signed-off-by: Ludovic Desroches Signed-off-by: Claudiu Beznea Signed-off-by: Nicolas Ferre --- arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index 9c9088c99cc4..f13ef4fbab60 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -67,7 +67,7 @@ usb1: ohci@00400000 { num-ports = <3>; - atmel,vbus-gpio = <&pioA PIN_PA10 GPIO_ACTIVE_HIGH>; + atmel,vbus-gpio = <&pioA PIN_PA27 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usb_default>; status = "okay"; @@ -330,7 +330,7 @@ }; pinctrl_led_gpio_default: led_gpio_default { - pinmux = , + pinmux = , , ; bias-pull-up; @@ -396,7 +396,7 @@ }; pinctrl_usb_default: usb_default { - pinmux = , + pinmux = , ; bias-disable; }; @@ -520,17 +520,17 @@ red { label = "red"; - gpios = <&pioA PIN_PA27 GPIO_ACTIVE_LOW>; + gpios = <&pioA PIN_PA10 GPIO_ACTIVE_HIGH>; }; green { label = "green"; - gpios = <&pioA PIN_PB1 GPIO_ACTIVE_LOW>; + gpios = <&pioA PIN_PB1 GPIO_ACTIVE_HIGH>; }; blue { label = "blue"; - gpios = <&pioA PIN_PA31 GPIO_ACTIVE_LOW>; + gpios = <&pioA PIN_PA31 GPIO_ACTIVE_HIGH>; linux,default-trigger = "heartbeat"; }; }; -- cgit From 5f506faa0de810f07af9345826fd588f61bb3b2f Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 14 Sep 2017 17:28:13 +0300 Subject: ARM: dts: at91: sama5d27_som1_ek: fix typos Fix typos that prevent proper using of uart2 and uart4 devices. Signed-off-by: Ludovic Desroches Signed-off-by: Claudiu Beznea Signed-off-by: Nicolas Ferre --- arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index f13ef4fbab60..be5cd913f274 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -120,7 +120,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mikrobus2_uart>; atmel,use-dma-rx; - atmel-use-dma-tx; + atmel,use-dma-tx; status = "okay"; }; @@ -178,7 +178,7 @@ uart4: serial@fc00c000 { atmel,use-dma-rx; atmel,use-dma-tx; - pinctrl-name = "default"; + pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mikrobus1_uart>; status = "okay"; }; -- cgit From e025a3ac3460275bf86a4c5d02857eee14db4247 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 14 Sep 2017 17:28:14 +0300 Subject: ARM: dts: at91: sama5d27_som1_ek: fix USB host vbus The USB host has 3 ports so we must specify the entries for each in the atmel,vbus-gpio property. The specified pin (PA27) is the vbus for USBB and not USBA. Signed-off-by: Nicolas Ferre [claudiu.beznea@microchip.com: change subject to match the desired prefix] Signed-off-by: Claudiu Beznea --- arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index be5cd913f274..60cb084a8d92 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -67,7 +67,10 @@ usb1: ohci@00400000 { num-ports = <3>; - atmel,vbus-gpio = <&pioA PIN_PA27 GPIO_ACTIVE_HIGH>; + atmel,vbus-gpio = <0 /* &pioA PIN_PD20 GPIO_ACTIVE_HIGH */ + &pioA PIN_PA27 GPIO_ACTIVE_HIGH + 0 + >; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usb_default>; status = "okay"; -- cgit From 093d79f62a89f47d9b5fd0746768146d9696535c Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 24 Aug 2017 13:44:54 +0200 Subject: ARM: at91: Replace uses of virt_to_phys with __pa_symbol The PM code wrongly uses virt_to_phys() instead of __pa_symbol() and was not updated by commit 64fc2a947a98 ("ARM: 8641/1: treewide: Replace uses of virt_to_phys with __pa_symbol") because it was not yet in tree. Signed-off-by: Alexandre Belloni Signed-off-by: Nicolas Ferre --- arch/arm/mach-at91/pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 5036f996e694..849014c01cf4 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -533,8 +533,8 @@ static void __init at91_pm_backup_init(void) } pm_bu->suspended = 0; - pm_bu->canary = virt_to_phys(&canary); - pm_bu->resume = virt_to_phys(cpu_resume); + pm_bu->canary = __pa_symbol(&canary); + pm_bu->resume = __pa_symbol(cpu_resume); return; -- cgit From 2d48a237c8b61b457d146310d7e1e61224d0ca56 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 9 Sep 2017 06:02:46 +0200 Subject: ARC: reset: Only build on archs that have IOMEM This avoids the error: drivers/reset/reset-hsdk-v1.o: In function `hsdkv1_reset_probe': /home/thomas/git/linux/drivers/reset/reset-hsdk-v1.c:101: undefined reference to `devm_ioremap_resource' collect2: error: ld returned 1 exit status Signed-off-by: Thomas Meyer Signed-off-by: Philipp Zabel --- drivers/reset/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index e0c393214264..d063bd5373dd 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -36,6 +36,7 @@ config RESET_BERLIN config RESET_HSDK_V1 bool "HSDK v1 Reset Driver" + depends on HAS_IOMEM default n help This enables the reset controller driver for HSDK v1. -- cgit From fc9655e65160936e32adae0d0e8aae25eb12c4e0 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Thu, 14 Sep 2017 12:49:41 +0200 Subject: ARC: reset: add missing DT binding documentation for HSDKv1 reset driver When applying the original patch [1], the DT binding docs were lost. This patch adds them back. [1] https://patchwork.kernel.org/patch/9852997/ Fixes: e0be864f1424 ("ARC: reset: introduce HSDKv1 reset driver") Signed-off-by: Eugeniy Paltsev Signed-off-by: Philipp Zabel --- .../bindings/reset/snps,hsdk-v1-reset.txt | 28 ++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt diff --git a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt b/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt new file mode 100644 index 000000000000..6a68146ee353 --- /dev/null +++ b/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt @@ -0,0 +1,28 @@ +Binding for the HSDK v1 reset controller + +This binding uses the common reset binding[1]. + +[1] Documentation/devicetree/bindings/reset/reset.txt + +Required properties: +- compatible: should be "snps,hsdk-v1.0-reset". +- reg: should always contain 2 pairs address - length: first for reset + configuration register and second for corresponding SW reset and status bits + register. +- #reset-cells: from common reset binding; Should always be set to 1. + +Example: + reset: reset@880 { + compatible = "snps,hsdk-v1.0-reset"; + #reset-cells = <1>; + reg = <0x8A0 0x4>, <0xFF0 0x4>; + }; + +Specifying reset lines connected to IP modules: + ethernet@.... { + .... + resets = <&reset HSDK_V1_ETH_RESET>; + .... + }; + +The index could be found in -- cgit From 70e743e4cec3733dc13559f6184b35d358b9ef3f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 16:52:59 +0200 Subject: uwb: ensure that endpoint is interrupt hwarc_neep_init() assumes that endpoint 0 is interrupt, but there's no check for that, which results in a WARNING in USB core code, when a bad USB descriptor is provided from a device: usb 1-1: BOGUS urb xfer, pipe 1 != type 3 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3 at drivers/usb/core/urb.c:449 usb_submit_urb+0xf8a/0x11d0 Modules linked in: CPU: 0 PID: 3 Comm: kworker/0:0 Not tainted 4.13.0+ #111 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event task: ffff88006bdc1a00 task.stack: ffff88006bde8000 RIP: 0010:usb_submit_urb+0xf8a/0x11d0 drivers/usb/core/urb.c:448 RSP: 0018:ffff88006bdee3c0 EFLAGS: 00010282 RAX: 0000000000000029 RBX: ffff8800672a7200 RCX: 0000000000000000 RDX: 0000000000000029 RSI: ffff88006c815c78 RDI: ffffed000d7bdc6a RBP: ffff88006bdee4c0 R08: fffffbfff0fe00ff R09: fffffbfff0fe00ff R10: 0000000000000018 R11: fffffbfff0fe00fe R12: 1ffff1000d7bdc7f R13: 0000000000000003 R14: 0000000000000001 R15: ffff88006b02cc90 FS: 0000000000000000(0000) GS:ffff88006c800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe4daddf000 CR3: 000000006add6000 CR4: 00000000000006f0 Call Trace: hwarc_neep_init+0x4ce/0x9c0 drivers/uwb/hwa-rc.c:710 uwb_rc_add+0x2fb/0x730 drivers/uwb/lc-rc.c:361 hwarc_probe+0x34e/0x9b0 drivers/uwb/hwa-rc.c:858 usb_probe_interface+0x351/0x8d0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_set_configuration+0x1064/0x1890 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4890 hub_port_connect_change drivers/usb/core/hub.c:4996 port_event drivers/usb/core/hub.c:5102 hub_event+0x23c8/0x37c0 drivers/usb/core/hub.c:5182 process_one_work+0x9fb/0x1570 kernel/workqueue.c:2097 worker_thread+0x1e4/0x1350 kernel/workqueue.c:2231 kthread+0x324/0x3f0 kernel/kthread.c:231 ret_from_fork+0x25/0x30 arch/x86/entry/entry_64.S:425 Code: 48 8b 85 30 ff ff ff 48 8d b8 98 00 00 00 e8 8e 93 07 ff 45 89 e8 44 89 f1 4c 89 fa 48 89 c6 48 c7 c7 a0 e5 55 86 e8 20 08 8f fd <0f> ff e9 9b f7 ff ff e8 4a 04 d6 fd e9 80 f7 ff ff e8 60 11 a6 ---[ end trace 55d741234124cfc3 ]--- Check that endpoint is interrupt. Found by syzkaller. Signed-off-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/hwa-rc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c index 35a1e777b449..9a53912bdfe9 100644 --- a/drivers/uwb/hwa-rc.c +++ b/drivers/uwb/hwa-rc.c @@ -825,6 +825,8 @@ static int hwarc_probe(struct usb_interface *iface, if (iface->cur_altsetting->desc.bNumEndpoints < 1) return -ENODEV; + if (!usb_endpoint_xfer_int(&iface->cur_altsetting->endpoint[0].desc)) + return -ENODEV; result = -ENOMEM; uwb_rc = uwb_rc_alloc(); -- cgit From bbf26183b7a6236ba602f4d6a2f7cade35bba043 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 14:30:55 +0200 Subject: uwb: properly check kthread_run return value uwbd_start() calls kthread_run() and checks that the return value is not NULL. But the return value is not NULL in case kthread_run() fails, it takes the form of ERR_PTR(-EINTR). Use IS_ERR() instead. Also add a check to uwbd_stop(). Signed-off-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/uwbd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c index 01c20a260a8b..39dd4ef53c77 100644 --- a/drivers/uwb/uwbd.c +++ b/drivers/uwb/uwbd.c @@ -302,18 +302,22 @@ static int uwbd(void *param) /** Start the UWB daemon */ void uwbd_start(struct uwb_rc *rc) { - rc->uwbd.task = kthread_run(uwbd, rc, "uwbd"); - if (rc->uwbd.task == NULL) + struct task_struct *task = kthread_run(uwbd, rc, "uwbd"); + if (IS_ERR(task)) { + rc->uwbd.task = NULL; printk(KERN_ERR "UWB: Cannot start management daemon; " "UWB won't work\n"); - else + } else { + rc->uwbd.task = task; rc->uwbd.pid = rc->uwbd.task->pid; + } } /* Stop the UWB daemon and free any unprocessed events */ void uwbd_stop(struct uwb_rc *rc) { - kthread_stop(rc->uwbd.task); + if (rc->uwbd.task) + kthread_stop(rc->uwbd.task); uwbd_flush(rc); } -- cgit From b2a542bbb3081dbd64acc8929c140d196664c406 Mon Sep 17 00:00:00 2001 From: Dmitry Fleytman Date: Tue, 5 Sep 2017 11:40:56 +0300 Subject: usb: Increase quirk delay for USB devices Commit e0429362ab15 ("usb: Add device quirk for Logitech HD Pro Webcams C920 and C930e") introduced quirk to workaround an issue with some Logitech webcams. The workaround is introducing delay for some USB operations. According to our testing, delay introduced by original commit is not long enough and in rare cases we still see issues described by the aforementioned commit. This patch increases delays introduced by original commit. Having this patch applied we do not see those problems anymore. Signed-off-by: Dmitry Fleytman Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 2 +- drivers/usb/core/hub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 4be52c602e9b..854c8d66cfbe 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -852,7 +852,7 @@ int usb_get_configuration(struct usb_device *dev) } if (dev->quirks & USB_QUIRK_DELAY_INIT) - msleep(100); + msleep(200); result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, bigbuffer, length); diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 41eaf0b52518..b5c733613823 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4838,7 +4838,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, goto loop; if (udev->quirks & USB_QUIRK_DELAY_INIT) - msleep(1000); + msleep(2000); /* consecutive bus-powered hubs aren't reliable; they can * violate the voltage drop budget. if the new child has -- cgit From 0a51fb7174f2b7866b4d7a4a5c23b685b674beb6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 9 Sep 2017 12:19:22 +0300 Subject: quota: add missing lock into __dquot_transfer() Lock dq_dqb_lock around dquot_decr_inodes() Signed-off-by: Konstantin Khlebnikov Fixes: 7b9ca4c61bc2 ("quota: Reduce contention on dq_data_lock") Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8381db9db6d9..50b0556a124f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1980,7 +1980,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0, &warn_to[cnt]); if (ret) { + spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); + spin_unlock(&transfer_to[cnt]->dq_dqb_lock); goto over_quota; } } -- cgit From 0ab0b271bf75073cb254b5ea0593aceae5a42bd3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 16 Sep 2017 22:32:12 +0200 Subject: isofs: fix build regression The new isofs_show_options() function fails to build when CONFIG_NLS is disabled: fs/isofs/inode.c: In function 'isofs_show_options': fs/isofs/inode.c:518:44: error: 'CONFIG_NLS_DEFAULT' undeclared (first use in this function) fs/isofs/inode.c:518:44: note: each undeclared identifier is reported only once for each function it appears in This adds a check for CONFIG_JOLIET (which selects NLS), matching the other uses of the iocharset handling in this file. Fixes: 6fecb86a44f5 ("isofs: Implement show_options") Signed-off-by: Arnd Bergmann Signed-off-by: Jan Kara --- fs/isofs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index db692f554158..447a24d77b89 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -514,9 +514,11 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) if (sbi->s_fmode != ISOFS_INVALID_MODE) seq_printf(m, ",fmode=%o", sbi->s_fmode); +#ifdef CONFIG_JOLIET if (sbi->s_nls_iocharset && strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); +#endif return 0; } -- cgit From 056e4fc2018364ba01a23a1ced0ccbbdfa4520b3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 21:23:13 +0200 Subject: staging: unisys/visorbus: add __init/__exit annotations gcc-4.6 causes a harmless warning about the init function: WARNING: vmlinux.o(.text+0xed62c2): Section mismatch in reference from the function init_unisys() to the function .init.text:visorutil_spar_detect() The function init_unisys() references the function __init visorutil_spar_detect(). This is often because init_unisys lacks a __init annotation or the annotation of visorutil_spar_detect is wrong. It appears that newer versions inline visorutil_spar_detect(), end up with an empty __init section. This marks the module entry points as __init and __exit respectively, which avoids the warning and slightly reduces the runtime code size. Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/staging/unisys/visorbus/visorchipset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/unisys/visorbus/visorchipset.c b/drivers/staging/unisys/visorbus/visorchipset.c index 74cce4f1a7bd..27ecf6fb49fd 100644 --- a/drivers/staging/unisys/visorbus/visorchipset.c +++ b/drivers/staging/unisys/visorbus/visorchipset.c @@ -1826,7 +1826,7 @@ static __init int visorutil_spar_detect(void) return 0; } -static int init_unisys(void) +static int __init init_unisys(void) { int result; @@ -1841,7 +1841,7 @@ static int init_unisys(void) return 0; }; -static void exit_unisys(void) +static void __exit exit_unisys(void) { acpi_bus_unregister_driver(&unisys_acpi_driver); } -- cgit From a3563b09f132661d447f69224ef65fdec02f5c61 Mon Sep 17 00:00:00 2001 From: Arun Nagendran Date: Fri, 15 Sep 2017 12:30:32 -0400 Subject: staging: mt29f_spinand: Enable the read ECC before program the page Current program_page function did following operation: 1. read page (with ECC OFF) 2. modify the page 3. write the page (with ECC ON) For some case(buggy flash Chip), while read the page without ECC ON, we may read the page with bit flip error and modify that bad page without knowing the bit flip error on that page. also we re-calculate the hash for bad page and write it. This could bring potential in-consistency problem with Flash data. Verify this logic with GIGA DEVICE Part(GD5F2GQ4RCFIG): we see this in-conststency problem wit Giga Device and fix on this patch resovle that issue. Signed-off-by: Arun Nagendran Signed-off-by: Greg Kroah-Hartman --- drivers/staging/mt29f_spinand/mt29f_spinand.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/staging/mt29f_spinand/mt29f_spinand.c b/drivers/staging/mt29f_spinand/mt29f_spinand.c index 13eaf16ecd16..87595c594b12 100644 --- a/drivers/staging/mt29f_spinand/mt29f_spinand.c +++ b/drivers/staging/mt29f_spinand/mt29f_spinand.c @@ -496,8 +496,12 @@ static int spinand_program_page(struct spi_device *spi_nand, if (!wbuf) return -ENOMEM; - enable_read_hw_ecc = 0; - spinand_read_page(spi_nand, page_id, 0, CACHE_BUF, wbuf); + enable_read_hw_ecc = 1; + retval = spinand_read_page(spi_nand, page_id, 0, CACHE_BUF, wbuf); + if (retval < 0) { + dev_err(&spi_nand->dev, "ecc error on read page!!!\n"); + return retval; + } for (i = offset, j = 0; i < len; i++, j++) wbuf[i] &= buf[j]; -- cgit From e1bf28868ab0bfd1fc73dd8d5e642d88456c30e9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Sep 2017 14:55:06 +0100 Subject: staging: r8822be: fix null pointer dereferences with a null driver_adapter The call to _rtl_dbg_trace via macro HALMAC_RT_TRACE will trigger a null pointer deference on a null driver_adapter. Fix this by assigning driver_adapter earlier to halmac_adapter->driver_adapter before the tracing call so that a non-null driver_adapter is passed instead. I should have spotted these with an earlier patch I sent, but I overlooked these in the rather large CoverityScan logs. Detected by CoverityScan, CID#1454550, CID#1454554, CID#1454565, CID#1454591, CID#1454598 ("Explicit null dereferenced") Fixes: 938a0447f094 ("staging: r8822be: Add code for halmac sub-driver") Signed-off-by: Colin Ian King Signed-off-by: Greg Kroah-Hartman --- .../staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c | 4 ++-- .../staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c index 5f84526cb5b5..edbf6af1c8b7 100644 --- a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c +++ b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c @@ -2901,11 +2901,11 @@ halmac_update_datapack_88xx(struct halmac_adapter *halmac_adapter, if (halmac_adapter->fw_version.h2c_version < 4) return HALMAC_RET_FW_NO_SUPPORT; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "[TRACE]%s ==========>\n", __func__); - driver_adapter = halmac_adapter->driver_adapter; - HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "[TRACE]%s <==========\n", __func__); diff --git a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c index f33024e4d853..544f638ed3ef 100644 --- a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c +++ b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c @@ -1618,10 +1618,11 @@ halmac_send_h2c_set_pwr_mode_88xx(struct halmac_adapter *halmac_adapter, void *driver_adapter = NULL; enum halmac_ret_status status = HALMAC_RET_SUCCESS; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "%s!!\n", __func__); - driver_adapter = halmac_adapter->driver_adapter; h2c_header = h2c_buff; h2c_cmd = h2c_header + HALMAC_H2C_CMD_HDR_SIZE_88XX; @@ -1713,10 +1714,11 @@ halmac_media_status_rpt_88xx(struct halmac_adapter *halmac_adapter, u8 op_mode, void *driver_adapter = NULL; enum halmac_ret_status status = HALMAC_RET_SUCCESS; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "halmac_send_h2c_set_pwr_mode_88xx!!\n"); - driver_adapter = halmac_adapter->driver_adapter; h2c_header = H2c_buff; h2c_cmd = h2c_header + HALMAC_H2C_CMD_HDR_SIZE_88XX; @@ -2143,10 +2145,11 @@ halmac_func_ctrl_ch_switch_88xx(struct halmac_adapter *halmac_adapter, enum halmac_cmd_process_status *process_status = &halmac_adapter->halmac_state.scan_state_set.process_status; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "halmac_ctrl_ch_switch!!\n"); - driver_adapter = halmac_adapter->driver_adapter; halmac_api = (struct halmac_api *)halmac_adapter->halmac_api; if (halmac_transition_scan_state_88xx( @@ -2276,15 +2279,13 @@ enum halmac_ret_status halmac_send_h2c_update_bcn_parse_info_88xx( { u8 h2c_buff[HALMAC_H2C_CMD_SIZE_88XX] = {0}; u16 h2c_seq_mum = 0; - void *driver_adapter = NULL; + void *driver_adapter = halmac_adapter->driver_adapter; struct halmac_h2c_header_info h2c_header_info; enum halmac_ret_status status = HALMAC_RET_SUCCESS; HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "%s!!\n", __func__); - driver_adapter = halmac_adapter->driver_adapter; - UPDATE_BEACON_PARSING_INFO_SET_FUNC_EN(h2c_buff, bcn_ie_info->func_en); UPDATE_BEACON_PARSING_INFO_SET_SIZE_TH(h2c_buff, bcn_ie_info->size_th); UPDATE_BEACON_PARSING_INFO_SET_TIMEOUT(h2c_buff, bcn_ie_info->timeout); -- cgit From b72703e26b9478da531144ce5c4552dd22f1103d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 6 Sep 2017 17:40:25 +0200 Subject: staging: pi433: Move limit check to switch default to kill warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2: drivers/staging/pi433/rf69.c: In function ‘rf69_set_dio_mapping’: drivers/staging/pi433/rf69.c:566: warning: ‘regaddr’ may be used uninitialized in this function drivers/staging/pi433/rf69.c:565: warning: ‘shift’ may be used uninitialized in this function drivers/staging/pi433/rf69.c:564: warning: ‘mask’ may be used uninitialized in this function While this is a false positive, it can easily be fixed by moving the limit check into the "default" case of the switch statement. Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- drivers/staging/pi433/rf69.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/staging/pi433/rf69.c b/drivers/staging/pi433/rf69.c index c4b1b218ea38..290b419aa9dd 100644 --- a/drivers/staging/pi433/rf69.c +++ b/drivers/staging/pi433/rf69.c @@ -570,12 +570,6 @@ int rf69_set_dio_mapping(struct spi_device *spi, u8 DIONumber, u8 value) dev_dbg(&spi->dev, "set: DIO mapping"); #endif - // check DIO number - if (DIONumber > 5) { - dev_dbg(&spi->dev, "set: illegal input param"); - return -EINVAL; - } - switch (DIONumber) { case 0: mask=MASK_DIO0; shift=SHIFT_DIO0; regaddr=REG_DIOMAPPING1; break; case 1: mask=MASK_DIO1; shift=SHIFT_DIO1; regaddr=REG_DIOMAPPING1; break; @@ -583,6 +577,9 @@ int rf69_set_dio_mapping(struct spi_device *spi, u8 DIONumber, u8 value) case 3: mask=MASK_DIO3; shift=SHIFT_DIO3; regaddr=REG_DIOMAPPING1; break; case 4: mask=MASK_DIO4; shift=SHIFT_DIO4; regaddr=REG_DIOMAPPING2; break; case 5: mask=MASK_DIO5; shift=SHIFT_DIO5; regaddr=REG_DIOMAPPING2; break; + default: + dev_dbg(&spi->dev, "set: illegal input param"); + return -EINVAL; } // read reg -- cgit From e5f5d0e20b6cecc0ebe6fc8e7df6f8823ad2d594 Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Tue, 5 Sep 2017 12:51:59 +0100 Subject: staging: speakup: fix speakup-r empty line lockup When cursor is at beginning of an empty or whitespace-only line and speakup-r typed, kernel locks up. This happens because deadlock of in input_event function over dev->event_lock, as demonstrated by lockdep logs. The reason for that is speakup simulates a down arrow - because cursor is at an empty line - while inside key press notifier handler which is ultimately triggered from input_event function. The simulated key press leads to input_event being called again, this time under its own context. So the spinlock is dev->event_lock is acquired while still being held. This patch ensures that key press is not simulated from inside key press notifier handler. Instead it delegates to cursor_timer. It starts the timer and passes RA_DOWN_ARROW as argument. When timer handler runs and sees RA_DOWN_ARROW, it will then call kbd_fakekey2(RA_DOWN_ARROW) which will correctly simulate the keypress inside timer context. When not inside key press notifier callback, the behaviour will remain the same as before this patch. Signed-off-by: Okash Khawaja Reviewed-by: Samuel Thibault Signed-off-by: Greg Kroah-Hartman --- drivers/staging/speakup/main.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c index 67956e24779c..56f7be6af1f6 100644 --- a/drivers/staging/speakup/main.c +++ b/drivers/staging/speakup/main.c @@ -1376,6 +1376,8 @@ static void reset_highlight_buffers(struct vc_data *); static int read_all_key; +static int in_keyboard_notifier; + static void start_read_all_timer(struct vc_data *vc, int command); enum { @@ -1408,7 +1410,10 @@ static void read_all_doc(struct vc_data *vc) cursor_track = read_all_mode; spk_reset_index_count(0); if (get_sentence_buf(vc, 0) == -1) { - kbd_fakekey2(vc, RA_DOWN_ARROW); + del_timer(&cursor_timer); + if (!in_keyboard_notifier) + speakup_fake_down_arrow(); + start_read_all_timer(vc, RA_DOWN_ARROW); } else { say_sentence_num(0, 0); synth_insert_next_index(0); @@ -2212,8 +2217,10 @@ static int keyboard_notifier_call(struct notifier_block *nb, int ret = NOTIFY_OK; static int keycode; /* to hold the current keycode */ + in_keyboard_notifier = 1; + if (vc->vc_mode == KD_GRAPHICS) - return ret; + goto out; /* * First, determine whether we are handling a fake keypress on @@ -2225,7 +2232,7 @@ static int keyboard_notifier_call(struct notifier_block *nb, */ if (speakup_fake_key_pressed()) - return ret; + goto out; switch (code) { case KBD_KEYCODE: @@ -2266,6 +2273,8 @@ static int keyboard_notifier_call(struct notifier_block *nb, break; } } +out: + in_keyboard_notifier = 0; return ret; } -- cgit From 974d4d03fc020af4fa4e9e72a86f0fefa37803c5 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sun, 3 Sep 2017 19:06:31 +0200 Subject: staging: vchiq_2835_arm: Fix NULL ptr dereference in free_pagelist This fixes a NULL pointer dereference on RPi 2 with multi_v7_defconfig. The function page_address() could return NULL with enabled CONFIG_HIGHMEM. So fix this by using kmap() instead. Signed-off-by: Stefan Wahren Fixes: 71bad7f08641 ("staging: add bcm2708 vchiq driver") Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c index 0159ca4407d8..be08849175ea 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c @@ -612,18 +612,20 @@ free_pagelist(struct vchiq_pagelist_info *pagelistinfo, if (head_bytes > actual) head_bytes = actual; - memcpy((char *)page_address(pages[0]) + + memcpy((char *)kmap(pages[0]) + pagelist->offset, fragments, head_bytes); + kunmap(pages[0]); } if ((actual >= 0) && (head_bytes < actual) && (tail_bytes != 0)) { - memcpy((char *)page_address(pages[num_pages - 1]) + + memcpy((char *)kmap(pages[num_pages - 1]) + ((pagelist->offset + actual) & (PAGE_SIZE - 1) & ~(g_cache_line_size - 1)), fragments + g_cache_line_size, tail_bytes); + kunmap(pages[num_pages - 1]); } down(&g_free_fragments_mutex); -- cgit From 55168470835688e5da5828cdcf1b1498d7baadb1 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Mon, 11 Sep 2017 10:45:12 +0300 Subject: usb: dwc3: ep0: fix DMA starvation by assigning req->trb on ep0 If we don't assign a TRB to ep0 requests, we won't be able to unmap the request later on resulting in starvation of DMA resources. Fixes: 4a71fcb8ac5f ("usb: dwc3: gadget: only unmap requests from DMA if mapped") Reported-by: Thinh Nguyen Tested-by: Thinh Nguyen Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/ep0.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 827e376bfa97..75e6cb044eb2 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -990,6 +990,8 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, DWC3_TRBCTL_CONTROL_DATA, true); + req->trb = &dwc->ep0_trb[dep->trb_enqueue - 1]; + /* Now prepare one extra TRB to align transfer size */ dwc3_ep0_prepare_one_trb(dep, dwc->bounce_addr, maxpacket - rem, @@ -1015,6 +1017,8 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, DWC3_TRBCTL_CONTROL_DATA, true); + req->trb = &dwc->ep0_trb[dep->trb_enqueue - 1]; + /* Now prepare one extra TRB to align transfer size */ dwc3_ep0_prepare_one_trb(dep, dwc->bounce_addr, 0, DWC3_TRBCTL_CONTROL_DATA, @@ -1029,6 +1033,9 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, dwc3_ep0_prepare_one_trb(dep, req->request.dma, req->request.length, DWC3_TRBCTL_CONTROL_DATA, false); + + req->trb = &dwc->ep0_trb[dep->trb_enqueue]; + ret = dwc3_ep0_start_trans(dep); } -- cgit From 13541226dc056fa3f54417ce12f18ba711a1591c Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 31 Aug 2017 11:06:07 -0700 Subject: ARC: reset: remove the misleading v1 suffix all over There is no plan yet to do a v2 board. And even if we were to do it only some IPs would actually change, so it be best to add suffixes at that point, not now ! Signed-off-by: Vineet Gupta Signed-off-by: Philipp Zabel --- .../devicetree/bindings/reset/snps,hsdk-reset.txt | 28 +++++ .../bindings/reset/snps,hsdk-v1-reset.txt | 28 ----- MAINTAINERS | 6 +- drivers/reset/Kconfig | 6 +- drivers/reset/Makefile | 2 +- drivers/reset/reset-hsdk-v1.c | 137 --------------------- drivers/reset/reset-hsdk.c | 137 +++++++++++++++++++++ include/dt-bindings/reset/snps,hsdk-reset.h | 17 +++ include/dt-bindings/reset/snps,hsdk-v1-reset.h | 17 --- 9 files changed, 189 insertions(+), 189 deletions(-) create mode 100644 Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt delete mode 100644 Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt delete mode 100644 drivers/reset/reset-hsdk-v1.c create mode 100644 drivers/reset/reset-hsdk.c create mode 100644 include/dt-bindings/reset/snps,hsdk-reset.h delete mode 100644 include/dt-bindings/reset/snps,hsdk-v1-reset.h diff --git a/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt b/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt new file mode 100644 index 000000000000..830069b1c37c --- /dev/null +++ b/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt @@ -0,0 +1,28 @@ +Binding for the Synopsys HSDK reset controller + +This binding uses the common reset binding[1]. + +[1] Documentation/devicetree/bindings/reset/reset.txt + +Required properties: +- compatible: should be "snps,hsdk-reset". +- reg: should always contain 2 pairs address - length: first for reset + configuration register and second for corresponding SW reset and status bits + register. +- #reset-cells: from common reset binding; Should always be set to 1. + +Example: + reset: reset@880 { + compatible = "snps,hsdk-reset"; + #reset-cells = <1>; + reg = <0x8A0 0x4>, <0xFF0 0x4>; + }; + +Specifying reset lines connected to IP modules: + ethernet@.... { + .... + resets = <&reset HSDK_V1_ETH_RESET>; + .... + }; + +The index could be found in diff --git a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt b/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt deleted file mode 100644 index 6a68146ee353..000000000000 --- a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt +++ /dev/null @@ -1,28 +0,0 @@ -Binding for the HSDK v1 reset controller - -This binding uses the common reset binding[1]. - -[1] Documentation/devicetree/bindings/reset/reset.txt - -Required properties: -- compatible: should be "snps,hsdk-v1.0-reset". -- reg: should always contain 2 pairs address - length: first for reset - configuration register and second for corresponding SW reset and status bits - register. -- #reset-cells: from common reset binding; Should always be set to 1. - -Example: - reset: reset@880 { - compatible = "snps,hsdk-v1.0-reset"; - #reset-cells = <1>; - reg = <0x8A0 0x4>, <0xFF0 0x4>; - }; - -Specifying reset lines connected to IP modules: - ethernet@.... { - .... - resets = <&reset HSDK_V1_ETH_RESET>; - .... - }; - -The index could be found in diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..3b9887b3644e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12915,9 +12915,9 @@ F: drivers/mmc/host/dw_mmc* SYNOPSYS HSDK RESET CONTROLLER DRIVER M: Eugeniy Paltsev S: Supported -F: drivers/reset/reset-hsdk-v1.c -F: include/dt-bindings/reset/snps,hsdk-v1-reset.h -F: Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt +F: drivers/reset/reset-hsdk.c +F: include/dt-bindings/reset/snps,hsdk-reset.h +F: Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt SYSTEM CONFIGURATION (SYSCON) M: Lee Jones diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index d063bd5373dd..a7c7d5a8c089 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -34,12 +34,12 @@ config RESET_BERLIN help This enables the reset controller driver for Marvell Berlin SoCs. -config RESET_HSDK_V1 - bool "HSDK v1 Reset Driver" +config RESET_HSDK + bool "Synopsys HSDK Reset Driver" depends on HAS_IOMEM default n help - This enables the reset controller driver for HSDK v1. + This enables the reset controller driver for HSDK board. config RESET_IMX7 bool "i.MX7 Reset Driver" if COMPILE_TEST diff --git a/drivers/reset/Makefile b/drivers/reset/Makefile index d368367110e5..af1c15c330b3 100644 --- a/drivers/reset/Makefile +++ b/drivers/reset/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_ARCH_TEGRA) += tegra/ obj-$(CONFIG_RESET_A10SR) += reset-a10sr.o obj-$(CONFIG_RESET_ATH79) += reset-ath79.o obj-$(CONFIG_RESET_BERLIN) += reset-berlin.o -obj-$(CONFIG_RESET_HSDK_V1) += reset-hsdk-v1.o +obj-$(CONFIG_RESET_HSDK) += reset-hsdk.o obj-$(CONFIG_RESET_IMX7) += reset-imx7.o obj-$(CONFIG_RESET_LANTIQ) += reset-lantiq.o obj-$(CONFIG_RESET_LPC18XX) += reset-lpc18xx.o diff --git a/drivers/reset/reset-hsdk-v1.c b/drivers/reset/reset-hsdk-v1.c deleted file mode 100644 index bca13e4bf622..000000000000 --- a/drivers/reset/reset-hsdk-v1.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (C) 2017 Synopsys. - * - * Synopsys HSDKv1 SDP reset driver. - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define to_hsdkv1_rst(p) container_of((p), struct hsdkv1_rst, rcdev) - -struct hsdkv1_rst { - void __iomem *regs_ctl; - void __iomem *regs_rst; - spinlock_t lock; - struct reset_controller_dev rcdev; -}; - -static const u32 rst_map[] = { - BIT(16), /* APB_RST */ - BIT(17), /* AXI_RST */ - BIT(18), /* ETH_RST */ - BIT(19), /* USB_RST */ - BIT(20), /* SDIO_RST */ - BIT(21), /* HDMI_RST */ - BIT(22), /* GFX_RST */ - BIT(25), /* DMAC_RST */ - BIT(31), /* EBI_RST */ -}; - -#define HSDK_MAX_RESETS ARRAY_SIZE(rst_map) - -#define CGU_SYS_RST_CTRL 0x0 -#define CGU_IP_SW_RESET 0x0 -#define CGU_IP_SW_RESET_DELAY_SHIFT 16 -#define CGU_IP_SW_RESET_DELAY_MASK GENMASK(31, CGU_IP_SW_RESET_DELAY_SHIFT) -#define CGU_IP_SW_RESET_DELAY 0 -#define CGU_IP_SW_RESET_RESET BIT(0) -#define SW_RESET_TIMEOUT 10000 - -static void hsdkv1_reset_config(struct hsdkv1_rst *rst, unsigned long id) -{ - writel(rst_map[id], rst->regs_ctl + CGU_SYS_RST_CTRL); -} - -static int hsdkv1_reset_do(struct hsdkv1_rst *rst) -{ - u32 reg; - - reg = readl(rst->regs_rst + CGU_IP_SW_RESET); - reg &= ~CGU_IP_SW_RESET_DELAY_MASK; - reg |= CGU_IP_SW_RESET_DELAY << CGU_IP_SW_RESET_DELAY_SHIFT; - reg |= CGU_IP_SW_RESET_RESET; - writel(reg, rst->regs_rst + CGU_IP_SW_RESET); - - /* wait till reset bit is back to 0 */ - return readl_poll_timeout_atomic(rst->regs_rst + CGU_IP_SW_RESET, reg, - !(reg & CGU_IP_SW_RESET_RESET), 5, SW_RESET_TIMEOUT); -} - -static int hsdkv1_reset_reset(struct reset_controller_dev *rcdev, - unsigned long id) -{ - struct hsdkv1_rst *rst = to_hsdkv1_rst(rcdev); - unsigned long flags; - int ret; - - spin_lock_irqsave(&rst->lock, flags); - hsdkv1_reset_config(rst, id); - ret = hsdkv1_reset_do(rst); - spin_unlock_irqrestore(&rst->lock, flags); - - return ret; -} - -static const struct reset_control_ops hsdkv1_reset_ops = { - .reset = hsdkv1_reset_reset, -}; - -static int hsdkv1_reset_probe(struct platform_device *pdev) -{ - struct hsdkv1_rst *rst; - struct resource *mem; - - rst = devm_kzalloc(&pdev->dev, sizeof(*rst), GFP_KERNEL); - if (!rst) - return -ENOMEM; - - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - rst->regs_ctl = devm_ioremap_resource(&pdev->dev, mem); - if (IS_ERR(rst->regs_ctl)) - return PTR_ERR(rst->regs_ctl); - - mem = platform_get_resource(pdev, IORESOURCE_MEM, 1); - rst->regs_rst = devm_ioremap_resource(&pdev->dev, mem); - if (IS_ERR(rst->regs_rst)) - return PTR_ERR(rst->regs_rst); - - spin_lock_init(&rst->lock); - - rst->rcdev.owner = THIS_MODULE; - rst->rcdev.ops = &hsdkv1_reset_ops; - rst->rcdev.of_node = pdev->dev.of_node; - rst->rcdev.nr_resets = HSDK_MAX_RESETS; - rst->rcdev.of_reset_n_cells = 1; - - return reset_controller_register(&rst->rcdev); -} - -static const struct of_device_id hsdkv1_reset_dt_match[] = { - { .compatible = "snps,hsdk-v1.0-reset" }, - { }, -}; - -static struct platform_driver hsdkv1_reset_driver = { - .probe = hsdkv1_reset_probe, - .driver = { - .name = "hsdk-v1.0-reset", - .of_match_table = hsdkv1_reset_dt_match, - }, -}; -builtin_platform_driver(hsdkv1_reset_driver); - -MODULE_AUTHOR("Eugeniy Paltsev "); -MODULE_DESCRIPTION("Synopsys HSDKv1 SDP reset driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/reset/reset-hsdk.c b/drivers/reset/reset-hsdk.c new file mode 100644 index 000000000000..8bce391c6943 --- /dev/null +++ b/drivers/reset/reset-hsdk.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2017 Synopsys. + * + * Synopsys HSDK Development platform reset driver. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define to_hsdk_rst(p) container_of((p), struct hsdk_rst, rcdev) + +struct hsdk_rst { + void __iomem *regs_ctl; + void __iomem *regs_rst; + spinlock_t lock; + struct reset_controller_dev rcdev; +}; + +static const u32 rst_map[] = { + BIT(16), /* APB_RST */ + BIT(17), /* AXI_RST */ + BIT(18), /* ETH_RST */ + BIT(19), /* USB_RST */ + BIT(20), /* SDIO_RST */ + BIT(21), /* HDMI_RST */ + BIT(22), /* GFX_RST */ + BIT(25), /* DMAC_RST */ + BIT(31), /* EBI_RST */ +}; + +#define HSDK_MAX_RESETS ARRAY_SIZE(rst_map) + +#define CGU_SYS_RST_CTRL 0x0 +#define CGU_IP_SW_RESET 0x0 +#define CGU_IP_SW_RESET_DELAY_SHIFT 16 +#define CGU_IP_SW_RESET_DELAY_MASK GENMASK(31, CGU_IP_SW_RESET_DELAY_SHIFT) +#define CGU_IP_SW_RESET_DELAY 0 +#define CGU_IP_SW_RESET_RESET BIT(0) +#define SW_RESET_TIMEOUT 10000 + +static void hsdk_reset_config(struct hsdk_rst *rst, unsigned long id) +{ + writel(rst_map[id], rst->regs_ctl + CGU_SYS_RST_CTRL); +} + +static int hsdk_reset_do(struct hsdk_rst *rst) +{ + u32 reg; + + reg = readl(rst->regs_rst + CGU_IP_SW_RESET); + reg &= ~CGU_IP_SW_RESET_DELAY_MASK; + reg |= CGU_IP_SW_RESET_DELAY << CGU_IP_SW_RESET_DELAY_SHIFT; + reg |= CGU_IP_SW_RESET_RESET; + writel(reg, rst->regs_rst + CGU_IP_SW_RESET); + + /* wait till reset bit is back to 0 */ + return readl_poll_timeout_atomic(rst->regs_rst + CGU_IP_SW_RESET, reg, + !(reg & CGU_IP_SW_RESET_RESET), 5, SW_RESET_TIMEOUT); +} + +static int hsdk_reset_reset(struct reset_controller_dev *rcdev, + unsigned long id) +{ + struct hsdk_rst *rst = to_hsdk_rst(rcdev); + unsigned long flags; + int ret; + + spin_lock_irqsave(&rst->lock, flags); + hsdk_reset_config(rst, id); + ret = hsdk_reset_do(rst); + spin_unlock_irqrestore(&rst->lock, flags); + + return ret; +} + +static const struct reset_control_ops hsdk_reset_ops = { + .reset = hsdk_reset_reset, +}; + +static int hsdk_reset_probe(struct platform_device *pdev) +{ + struct hsdk_rst *rst; + struct resource *mem; + + rst = devm_kzalloc(&pdev->dev, sizeof(*rst), GFP_KERNEL); + if (!rst) + return -ENOMEM; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + rst->regs_ctl = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(rst->regs_ctl)) + return PTR_ERR(rst->regs_ctl); + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 1); + rst->regs_rst = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(rst->regs_rst)) + return PTR_ERR(rst->regs_rst); + + spin_lock_init(&rst->lock); + + rst->rcdev.owner = THIS_MODULE; + rst->rcdev.ops = &hsdk_reset_ops; + rst->rcdev.of_node = pdev->dev.of_node; + rst->rcdev.nr_resets = HSDK_MAX_RESETS; + rst->rcdev.of_reset_n_cells = 1; + + return reset_controller_register(&rst->rcdev); +} + +static const struct of_device_id hsdk_reset_dt_match[] = { + { .compatible = "snps,hsdk-reset" }, + { }, +}; + +static struct platform_driver hsdk_reset_driver = { + .probe = hsdk_reset_probe, + .driver = { + .name = "hsdk-reset", + .of_match_table = hsdk_reset_dt_match, + }, +}; +builtin_platform_driver(hsdk_reset_driver); + +MODULE_AUTHOR("Eugeniy Paltsev "); +MODULE_DESCRIPTION("Synopsys HSDK SDP reset driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/dt-bindings/reset/snps,hsdk-reset.h b/include/dt-bindings/reset/snps,hsdk-reset.h new file mode 100644 index 000000000000..e1a643e4bc91 --- /dev/null +++ b/include/dt-bindings/reset/snps,hsdk-reset.h @@ -0,0 +1,17 @@ +/** + * This header provides index for the HSDK reset controller. + */ +#ifndef _DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK +#define _DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK + +#define HSDK_APB_RESET 0 +#define HSDK_AXI_RESET 1 +#define HSDK_ETH_RESET 2 +#define HSDK_USB_RESET 3 +#define HSDK_SDIO_RESET 4 +#define HSDK_HDMI_RESET 5 +#define HSDK_GFX_RESET 6 +#define HSDK_DMAC_RESET 7 +#define HSDK_EBI_RESET 8 + +#endif /*_DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK*/ diff --git a/include/dt-bindings/reset/snps,hsdk-v1-reset.h b/include/dt-bindings/reset/snps,hsdk-v1-reset.h deleted file mode 100644 index d898c89b7123..000000000000 --- a/include/dt-bindings/reset/snps,hsdk-v1-reset.h +++ /dev/null @@ -1,17 +0,0 @@ -/** - * This header provides index for the HSDK v1 reset controller. - */ -#ifndef _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1 -#define _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1 - -#define HSDK_V1_APB_RESET 0 -#define HSDK_V1_AXI_RESET 1 -#define HSDK_V1_ETH_RESET 2 -#define HSDK_V1_USB_RESET 3 -#define HSDK_V1_SDIO_RESET 4 -#define HSDK_V1_HDMI_RESET 5 -#define HSDK_V1_GFX_RESET 6 -#define HSDK_V1_DMAC_RESET 7 -#define HSDK_V1_EBI_RESET 8 - -#endif /*_DT_BINDINGS_RESET_CONTROLLER_HSDK_V1*/ -- cgit From a931b9ce93841a5b66b709ba5a244276e345e63b Mon Sep 17 00:00:00 2001 From: Guneshwor Singh Date: Thu, 14 Sep 2017 17:49:40 +0530 Subject: ALSA: compress: Remove unused variable Commit 04c5d5a430fc ("ALSA: compress: Embed struct device") removed the statement that used 'str' but didn't remove the variable itself. So remove it. [Adding stable to Cc since pr_debug() may refer to the uninitialized buffer -- tiwai] Fixes: 04c5d5a430fc ("ALSA: compress: Embed struct device") Signed-off-by: Guneshwor Singh Cc: Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index fec1dfdb14ad..4490a699030b 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -948,14 +948,13 @@ static const struct file_operations snd_compr_file_ops = { static int snd_compress_dev_register(struct snd_device *device) { int ret = -EINVAL; - char str[16]; struct snd_compr *compr; if (snd_BUG_ON(!device || !device->device_data)) return -EBADFD; compr = device->device_data; - pr_debug("reg %s for device %s, direction %d\n", str, compr->name, + pr_debug("reg device %s, direction %d\n", compr->name, compr->direction); /* register compressed device */ ret = snd_register_device(SNDRV_DEVICE_TYPE_COMPRESS, -- cgit From 1c363eaece2752c5f8b1b874cb4ae435de06aa66 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 10:56:13 +0200 Subject: android: binder: fix type mismatch warning Allowing binder to expose the 64-bit API on 32-bit kernels caused a build warning: drivers/android/binder.c: In function 'binder_transaction_buffer_release': drivers/android/binder.c:2220:15: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] fd_array = (u32 *)(parent_buffer + fda->parent_offset); ^ drivers/android/binder.c: In function 'binder_translate_fd_array': drivers/android/binder.c:2445:13: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] fd_array = (u32 *)(parent_buffer + fda->parent_offset); ^ drivers/android/binder.c: In function 'binder_fixup_parent': drivers/android/binder.c:2511:18: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] This adds extra type casts to avoid the warning. However, there is another problem with the Kconfig option: turning it on or off creates two incompatible ABI versions, a kernel that has this enabled cannot run user space that was built without it or vice versa. A better solution might be to leave the option hidden until the binder code is fixed to deal with both ABI versions. Fixes: e8d2ed7db7c3 ("Revert "staging: Fix build issues with new binder API"") Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d055b3f2a207..72af95c9ea22 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2217,7 +2217,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, debug_id, (u64)fda->num_fds); continue; } - fd_array = (u32 *)(parent_buffer + fda->parent_offset); + fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); for (fd_index = 0; fd_index < fda->num_fds; fd_index++) task_close_fd(proc, fd_array[fd_index]); } break; @@ -2442,7 +2442,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, */ parent_buffer = parent->buffer - binder_alloc_get_user_buffer_offset(&target_proc->alloc); - fd_array = (u32 *)(parent_buffer + fda->parent_offset); + fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); @@ -2508,7 +2508,7 @@ static int binder_fixup_parent(struct binder_transaction *t, proc->pid, thread->pid); return -EINVAL; } - parent_buffer = (u8 *)(parent->buffer - + parent_buffer = (u8 *)((uintptr_t)parent->buffer - binder_alloc_get_user_buffer_offset( &target_proc->alloc)); *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; -- cgit From 52b81611f209da5f49019260522633e994e241b5 Mon Sep 17 00:00:00 2001 From: Xu YiPing Date: Tue, 5 Sep 2017 10:25:38 -0700 Subject: binder: fix an ret value override commit 372e3147df70 ("binder: guarantee txn complete / errors delivered in-order") incorrectly defined a local ret value. This ret value will be invalid when out of the if block Fixes: 372e3147df70 ("binder: refactor binder ref inc/dec for thread safety") Signed-off-by: Xu YiPing Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 72af95c9ea22..b257bb0e2cfe 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2326,7 +2326,6 @@ static int binder_translate_handle(struct flat_binder_object *fp, (u64)node->ptr); binder_node_unlock(node); } else { - int ret; struct binder_ref_data dest_rdata; binder_node_unlock(node); -- cgit From d53bebdf4d779497b29e1aad26e19cac1d446f42 Mon Sep 17 00:00:00 2001 From: Xu YiPing Date: Tue, 5 Sep 2017 10:21:52 -0700 Subject: binder: fix memory corruption in binder_transaction binder commit 7a4408c6bd3e ("binder: make sure accesses to proc/thread are safe") made a change to enqueue tcomplete to thread->todo before enqueuing the transaction. However, in err_dead_proc_or_thread case, the tcomplete is directly freed, without dequeued. It may cause the thread->todo list to be corrupted. So, dequeue it before freeing. Fixes: 7a4408c6bd3e ("binder: make sure accesses to proc/thread are safe") Signed-off-by: Xu YiPing Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b257bb0e2cfe..ab34239a76ee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3082,6 +3082,7 @@ static void binder_transaction(struct binder_proc *proc, err_dead_proc_or_thread: return_error = BR_DEAD_REPLY; return_error_line = __LINE__; + binder_dequeue_work(proc, tcomplete); err_translate_failed: err_bad_object_type: err_bad_offset: -- cgit From 93dc1774d2a4c7a298d5cdf78cc8acdcb7b1428d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 7 Sep 2017 15:37:30 +0200 Subject: auxdisplay: charlcd: properly restore atomic counter on error path Commit f4757af ("staging: panel: Fix single-open policy race condition") introduced in 3.19-rc1 attempted to fix a race condition on the open, but failed to properly do it and used to exit without restoring the semaphore. This results in -EBUSY being returned after the first open error until the module is reloaded or the system restarted (ie: consecutive to a dual open resulting in -EBUSY or to a permission error). [ Note for stable maintainers: the code moved from drivers/misc/panel.c to drivers/auxdisplay/{charlcd,panel}.c during 4.12. The patch easily applies there (modulo the renamed atomic counter) but I can provide a tested backport if desired. ] Fixes: f4757af85 # 3.19-rc1 Cc: stable@vger.kernel.org Cc: Mariusz Gorski Cc: Geert Uytterhoeven Cc: Miguel Ojeda Sandonis Signed-off-by: Willy Tarreau Signed-off-by: Greg Kroah-Hartman --- drivers/auxdisplay/charlcd.c | 11 +++++++++-- drivers/auxdisplay/panel.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index cfeb049a01ef..642afd88870b 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -647,18 +647,25 @@ static ssize_t charlcd_write(struct file *file, const char __user *buf, static int charlcd_open(struct inode *inode, struct file *file) { struct charlcd_priv *priv = to_priv(the_charlcd); + int ret; + ret = -EBUSY; if (!atomic_dec_and_test(&charlcd_available)) - return -EBUSY; /* open only once at a time */ + goto fail; /* open only once at a time */ + ret = -EPERM; if (file->f_mode & FMODE_READ) /* device is write-only */ - return -EPERM; + goto fail; if (priv->must_clear) { charlcd_clear_display(&priv->lcd); priv->must_clear = false; } return nonseekable_open(inode, file); + + fail: + atomic_inc(&charlcd_available); + return ret; } static int charlcd_release(struct inode *inode, struct file *file) diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index df126dcdaf18..6911acd896d9 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1105,14 +1105,21 @@ static ssize_t keypad_read(struct file *file, static int keypad_open(struct inode *inode, struct file *file) { + int ret; + + ret = -EBUSY; if (!atomic_dec_and_test(&keypad_available)) - return -EBUSY; /* open only once at a time */ + goto fail; /* open only once at a time */ + ret = -EPERM; if (file->f_mode & FMODE_WRITE) /* device is read-only */ - return -EPERM; + goto fail; keypad_buflen = 0; /* flush the buffer on opening */ return 0; + fail: + atomic_inc(&keypad_available); + return ret; } static int keypad_release(struct inode *inode, struct file *file) -- cgit From 38b0774c0598c7a54b8499d18c2b764c35dc94ab Mon Sep 17 00:00:00 2001 From: Guy Shapiro Date: Mon, 11 Sep 2017 11:00:11 +0200 Subject: nvmem: core: return EFBIG on out-of-range write When writing data that exceeds the nvmem size to a nvmem sysfs file using the sh redirection operator >, the shell hangs, trying to write the out-of-range bytes endlessly. Fix the problem by returning EFBIG described in man 2 write. Similar change was done for binary sysfs files on commit 0936896056365349afa867c16e9f9100a6707cbf Signed-off-by: Guy Shapiro Cc: linux-api@vger.kernel.org Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index de54c7f5048a..3866117bc285 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -135,7 +135,7 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj, /* Stop the user from writing */ if (pos >= nvmem->size) - return 0; + return -EFBIG; if (count < nvmem->word_size) return -EINVAL; -- cgit From aad8d097c9224be264939fc6c02a5570ea094f60 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 11 Sep 2017 11:00:12 +0200 Subject: nvmem: add missing of_node_put() in of_nvmem_cell_get() of_get_next_parent() increments the refcount of the returned node. It should be put when done. Signed-off-by: Masahiro Yamada Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 3866117bc285..d12e5de78e70 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -789,6 +789,7 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np, return ERR_PTR(-EINVAL); nvmem = __nvmem_device_get(nvmem_np, NULL, NULL); + of_node_put(nvmem_np); if (IS_ERR(nvmem)) return ERR_CAST(nvmem); -- cgit From 6878e7de6af726de47f9f3bec649c3f49e786586 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 13 Sep 2017 16:29:48 -0700 Subject: driver core: suppress sending MODALIAS in UNBIND uevents The current udev rules cause modules to be loaded on all device events save for "remove". With the introduction of KOBJ_BIND/KOBJ_UNBIND this causes issues, as driver modules that have devices bound to their drivers get immediately reloaded, and it appears to the user that module unloading doe snot work. The standard udev matching rule is foillowing: ENV{MODALIAS}=="?*", RUN{builtin}+="kmod load $env{MODALIAS}" Given that MODALIAS data is not terribly useful for UNBIND event, let's zap it from the generated uevent environment until we get userspace updated with the correct udev rule that only loads modules on "add" event. Reported-by: Jakub Kicinski Tested-by: Jakub Kicinski Fixes: 1455cf8dbfd0 ("driver core: emit uevents when device is bound ...") Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- lib/kobject_uevent.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index e590523ea476..f237a09a5862 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -294,6 +294,26 @@ static void cleanup_uevent_env(struct subprocess_info *info) } #endif +static void zap_modalias_env(struct kobj_uevent_env *env) +{ + static const char modalias_prefix[] = "MODALIAS="; + int i; + + for (i = 0; i < env->envp_idx;) { + if (strncmp(env->envp[i], modalias_prefix, + sizeof(modalias_prefix) - 1)) { + i++; + continue; + } + + if (i != env->envp_idx - 1) + memmove(&env->envp[i], &env->envp[i + 1], + sizeof(env->envp[i]) * env->envp_idx - 1); + + env->envp_idx--; + } +} + /** * kobject_uevent_env - send an uevent with environmental data * @@ -409,16 +429,29 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, } } - /* - * Mark "add" and "remove" events in the object to ensure proper - * events to userspace during automatic cleanup. If the object did - * send an "add" event, "remove" will automatically generated by - * the core, if not already done by the caller. - */ - if (action == KOBJ_ADD) + switch (action) { + case KOBJ_ADD: + /* + * Mark "add" event so we can make sure we deliver "remove" + * event to userspace during automatic cleanup. If + * the object did send an "add" event, "remove" will + * automatically generated by the core, if not already done + * by the caller. + */ kobj->state_add_uevent_sent = 1; - else if (action == KOBJ_REMOVE) + break; + + case KOBJ_REMOVE: kobj->state_remove_uevent_sent = 1; + break; + + case KOBJ_UNBIND: + zap_modalias_env(env); + break; + + default: + break; + } mutex_lock(&uevent_sock_mutex); /* we will send an event, so request a new sequence number */ -- cgit From 452562abb5b76c14449dead2a7113f641893e8bc Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 7 Sep 2017 15:16:05 +0100 Subject: base: arch_topology: fix section mismatch build warnings Commit 2ef7a2953c81 ("arm, arm64: factorize common cpu capacity default code") introduced init_cpu_capacity_callback and init_cpu_capacity_notifier which are referenced from initcall and are missing __init{,data} annotations resulting the below section mismatch build warnings. "WARNING: vmlinux.o(.text+0xbab790): Section mismatch in reference from the function init_cpu_capacity_callback() to the variable .init.text:$x The function init_cpu_capacity_callback() references the variable __init $x. This is often because init_cpu_capacity_callback lacks a __init annotation or the annotation of $x is wrong." This patch fixes the above build warnings by adding the required annotations. Fixes: 2ef7a2953c81 ("arm, arm64: factorize common cpu capacity default code") Cc: Juri Lelli Cc: stable Signed-off-by: Sudeep Holla Signed-off-by: Greg Kroah-Hartman --- drivers/base/arch_topology.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 41be9ff7d70a..6df7d6676a48 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -166,11 +166,11 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) } #ifdef CONFIG_CPU_FREQ -static cpumask_var_t cpus_to_visit; -static void parsing_done_workfn(struct work_struct *work); -static DECLARE_WORK(parsing_done_work, parsing_done_workfn); +static cpumask_var_t cpus_to_visit __initdata; +static void __init parsing_done_workfn(struct work_struct *work); +static __initdata DECLARE_WORK(parsing_done_work, parsing_done_workfn); -static int +static int __init init_cpu_capacity_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -206,7 +206,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, return 0; } -static struct notifier_block init_cpu_capacity_notifier = { +static struct notifier_block init_cpu_capacity_notifier __initdata = { .notifier_call = init_cpu_capacity_callback, }; @@ -232,7 +232,7 @@ static int __init register_cpufreq_notifier(void) } core_initcall(register_cpufreq_notifier); -static void parsing_done_workfn(struct work_struct *work) +static void __init parsing_done_workfn(struct work_struct *work) { cpufreq_unregister_notifier(&init_cpu_capacity_notifier, CPUFREQ_POLICY_NOTIFIER); -- cgit From 9821786d7c90eee2f6852261893d9703801aae47 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Sep 2017 17:39:12 +0300 Subject: usb: xhci: Free the right ring in xhci_add_endpoint() In the xhci_add_endpoint(), a new ring was allocated and saved at xhci_virt_ep->new_ring. Hence, when error happens, we need to free the allocated ring before returning error. Current code frees xhci_virt_ep->ring instead of the new_ring. This patch fixes this. Cc: Signed-off-by: Lu Baolu Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index b2ff1ff1a02f..ee198ea47f49 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1703,7 +1703,8 @@ static int xhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev, if (xhci->quirks & XHCI_MTK_HOST) { ret = xhci_mtk_add_ep_quirk(hcd, udev, ep); if (ret < 0) { - xhci_free_endpoint_ring(xhci, virt_dev, ep_index); + xhci_ring_free(xhci, virt_dev->eps[ep_index].new_ring); + virt_dev->eps[ep_index].new_ring = NULL; return ret; } } -- cgit From 5a838a13c9b4e5dd188b7a6eaeb894e9358ead0c Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:13 +0300 Subject: xhci: fix finding correct bus_state structure for USB 3.1 hosts xhci driver keeps a bus_state structure for each hcd (usb2 and usb3) The structure is picked based on hcd speed, but driver only compared for HCD_USB3 speed, returning the wrong bus_state for HCD_USB31 hosts. This caused null pointer dereference errors in bus_resume function. Cc: Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 2abaa4d6d39d..e96e90d0fe9f 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1681,7 +1681,7 @@ struct xhci_bus_state { static inline unsigned int hcd_index(struct usb_hcd *hcd) { - if (hcd->speed == HCD_USB3) + if (hcd->speed >= HCD_USB3) return 0; else return 1; -- cgit From 114ec3a6f9096d211a4aff4277793ba969a62c73 Mon Sep 17 00:00:00 2001 From: Jim Dickerson Date: Mon, 18 Sep 2017 17:39:14 +0300 Subject: usb: pci-quirks.c: Corrected timeout values used in handshake Servers were emitting failed handoff messages but were not waiting the full 1 second as designated in section 4.22.1 of the eXtensible Host Controller Interface specifications. The handshake was using wrong units so calls were made with milliseconds not microseconds. Comments referenced 5 seconds not 1 second as in specs. The wrong units were also corrected in a second handshake call. Cc: Signed-off-by: Jim Dickerson Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 658d9d1f9ea3..e02dbb1e1cdd 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -1022,7 +1022,7 @@ EXPORT_SYMBOL_GPL(usb_disable_xhci_ports); * * Takes care of the handoff between the Pre-OS (i.e. BIOS) and the OS. * It signals to the BIOS that the OS wants control of the host controller, - * and then waits 5 seconds for the BIOS to hand over control. + * and then waits 1 second for the BIOS to hand over control. * If we timeout, assume the BIOS is broken and take control anyway. */ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) @@ -1069,9 +1069,9 @@ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) if (val & XHCI_HC_BIOS_OWNED) { writel(val | XHCI_HC_OS_OWNED, base + ext_cap_offset); - /* Wait for 5 seconds with 10 microsecond polling interval */ + /* Wait for 1 second with 10 microsecond polling interval */ timeout = handshake(base + ext_cap_offset, XHCI_HC_BIOS_OWNED, - 0, 5000, 10); + 0, 1000000, 10); /* Assume a buggy BIOS and take HC ownership anyway */ if (timeout) { @@ -1100,7 +1100,7 @@ hc_init: * operational or runtime registers. Wait 5 seconds and no more. */ timeout = handshake(op_reg_base + XHCI_STS_OFFSET, XHCI_STS_CNR, 0, - 5000, 10); + 5000000, 10); /* Assume a buggy HC and start HC initialization anyway */ if (timeout) { val = readl(op_reg_base + XHCI_STS_OFFSET); -- cgit From 76a14d7bf92960aac2f5450bfd23783bfa618be9 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:15 +0300 Subject: xhci: fix wrong endpoint ESIT value shown in tracing Read the endpiont ESIT from endpiont context using correct macro. Add a macro for reading the high bits of ESIT for Large ESIT Payload Capable hosts (LEC=1) Cc: # 4.12 Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index e96e90d0fe9f..7c87189e8110 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -735,6 +735,8 @@ struct xhci_ep_ctx { #define EP_MAXPSTREAMS(p) (((p) << 10) & EP_MAXPSTREAMS_MASK) /* Endpoint is set up with a Linear Stream Array (vs. Secondary Stream Array) */ #define EP_HAS_LSA (1 << 15) +/* hosts with LEC=1 use bits 31:24 as ESIT high bits. */ +#define CTX_TO_MAX_ESIT_PAYLOAD_HI(p) (((p) >> 24) & 0xff) /* ep_info2 bitmasks */ /* @@ -2540,8 +2542,8 @@ static inline const char *xhci_decode_ep_context(u32 info, u32 info2, u64 deq, u8 lsa; u8 hid; - esit = EP_MAX_ESIT_PAYLOAD_HI(info) << 16 | - EP_MAX_ESIT_PAYLOAD_LO(tx_info); + esit = CTX_TO_MAX_ESIT_PAYLOAD_HI(info) << 16 | + CTX_TO_MAX_ESIT_PAYLOAD(tx_info); ep_state = info & EP_STATE_MASK; max_pstr = info & EP_MAXPSTREAMS_MASK; -- cgit From c6b8e79306f515b5483eb11076e0fbfc140434a8 Mon Sep 17 00:00:00 2001 From: Adam Wallis Date: Mon, 18 Sep 2017 17:39:16 +0300 Subject: usb: host: xhci-plat: allow sysdev to inherit from ACPI Commit 4c39d4b949d3 ("usb: xhci: use bus->sysdev for DMA configuration") updated the method determining DMA for XHCI from sysdev. However, this patch broke the ability to enumerate the FWNODE from parent ACPI devices from the child plat XHCI device. Currently, xhci_plat is not set up properly when the parent device is an ACPI node. The conditions that xhci_plat_probe should satisfy are 1. xhci_plat comes from firmware 2. xhci_plat is child of a device from firmware (dwc3-plat) 3. xhci_plat is grandchild of a pci device (dwc3-pci) Case 2 is covered when the child is an OF node (by checking sysdev->parent->of_node), however, an ACPI parent will return NULL in the of_node check and will thus not result in sysdev being set to sysdev->parent [ 17.591549] xhci-hcd: probe of xhci-hcd.6.auto failed with error -5 This change adds a check for ACPI to completely allow for condition 2. This is done by first checking if the parent node is of type ACPI (e.g., dwc3-plat) and set sysdev to sysdev->parent if either of the two following conditions are met: 1: If fwnode is empty (in the case that platform_device_add_properties was not called on the allocated platform device) 2: fwnode exists but is not of type ACPI (this would happen if platform_device_add_properties was called on the allocated device. Instead of type FWNODE_ACPI, you would end up with FWNODE_PDATA) Cc: stable@vger.kernel.org #4.12.x Cc: stable@vger.kernel.org #4.13.x Fixes: 4c39d4b949d3 ("usb: xhci: use bus->sysdev for DMA configuration") Tested-by: Thang Q. Nguyen Signed-off-by: Adam Wallis Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-plat.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index 163bafde709f..1cb6eaef4ae1 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -178,14 +178,18 @@ static int xhci_plat_probe(struct platform_device *pdev) * 2. xhci_plat is child of a device from firmware (dwc3-plat) * 3. xhci_plat is grandchild of a pci device (dwc3-pci) */ - sysdev = &pdev->dev; - if (sysdev->parent && !sysdev->of_node && sysdev->parent->of_node) - sysdev = sysdev->parent; + for (sysdev = &pdev->dev; sysdev; sysdev = sysdev->parent) { + if (is_of_node(sysdev->fwnode) || + is_acpi_device_node(sysdev->fwnode)) + break; #ifdef CONFIG_PCI - else if (sysdev->parent && sysdev->parent->parent && - sysdev->parent->parent->bus == &pci_bus_type) - sysdev = sysdev->parent->parent; + else if (sysdev->bus == &pci_bus_type) + break; #endif + } + + if (!sysdev) + sysdev = &pdev->dev; /* Try to set 64-bit DMA first */ if (WARN_ON(!sysdev->dma_mask)) -- cgit From 4ec1cd3eeeee7ccc35681270da028dbc29ca7bbd Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:17 +0300 Subject: xhci: Fix sleeping with spin_lock_irq() held in ASmedia 1042A workaround The flow control workaround for ASM1042A xHC hosts sleeps between register polling. The workaround gets called in several places, among them with spin_lock_irq() held when xHC host is resumed or hoplug removed. This was noticed as kernel panics at resume on a Dell XPS15 9550 with TB16 thunderbolt dock. Avoid sleeping with spin_lock_irq() held, use udelay() instead The original workaround was added to 4.9 and 4.12 stable releases, this patch needs to be applied to those as well. Fixes: 9da5a1092b13 ("xhci: Bad Ethernet performance plugged in ASM1042A host") Cc: #4.9+ Reported-by: Jose Marino Tested-by: Jose Marino Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index e02dbb1e1cdd..6dda3623a276 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -447,7 +447,7 @@ static int usb_asmedia_wait_write(struct pci_dev *pdev) if ((value & ASMT_CONTROL_WRITE_BIT) == 0) return 0; - usleep_range(40, 60); + udelay(50); } dev_warn(&pdev->dev, "%s: check_write_ready timeout", __func__); -- cgit From 7bea22b124d77845c85a62eaa29a85ba6cc2f899 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:18 +0300 Subject: xhci: set missing SuperSpeedPlus Link Protocol bit in roothub descriptor A SuperSpeedPlus roothub needs to have the Link Protocol (LP) bit set in the bmSublinkSpeedAttr[] entry of a SuperSpeedPlus descriptor. If the xhci controller has an optional Protocol Speed ID (PSI) table then that will be used as a base to create the roothub SuperSpeedPlus descriptor. The PSI table does not however necessary contain the LP bit so we need to set it manually. Check the psi speed and set LP bit if speed is 10Gbps or higher. We're not setting it for 5 to 10Gbps as USB 3.1 specification always mention SuperSpeedPlus for 10Gbps or higher, and some SSIC USB 3.0 speeds can be over 5Gbps, such as SSIC-G3B-L1 at 5830 Mbps Cc: # 4.6+ Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index ad89a6d4111b..4dba2cf73cbe 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -112,7 +112,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, /* If PSI table exists, add the custom speed attributes from it */ if (usb3_1 && xhci->usb3_rhub.psi_count) { - u32 ssp_cap_base, bm_attrib, psi; + u32 ssp_cap_base, bm_attrib, psi, psi_mant, psi_exp; int offset; ssp_cap_base = USB_DT_BOS_SIZE + USB_DT_USB_SS_CAP_SIZE; @@ -139,6 +139,15 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, for (i = 0; i < xhci->usb3_rhub.psi_count; i++) { psi = xhci->usb3_rhub.psi[i]; psi &= ~USB_SSP_SUBLINK_SPEED_RSVD; + psi_exp = XHCI_EXT_PORT_PSIE(psi); + psi_mant = XHCI_EXT_PORT_PSIM(psi); + + /* Shift to Gbps and set SSP Link BIT(14) if 10Gpbs */ + for (; psi_exp < 3; psi_exp++) + psi_mant /= 1000; + if (psi_mant >= 10) + psi |= BIT(14); + if ((psi & PLT_MASK) == PLT_SYM) { /* Symmetric, create SSA RX and TX from one PSI entry */ put_unaligned_le32(psi, &buf[offset]); -- cgit From bcd6a7aa13800afc1418e6b29d944d882214939a Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 18 Sep 2017 17:39:19 +0300 Subject: Revert "xhci: Limit USB2 port wake support for AMD Promontory hosts" This reverts commit dec08194ffeccfa1cf085906b53d301930eae18f. Commit dec08194ffec ("xhci: Limit USB2 port wake support for AMD Promontory hosts") makes all high speed USB ports on ASUS PRIME B350M-A cease to function after enabling runtime PM. All boards with this chipsets will be affected, so revert the commit. The original patch was added to stable 4.9, 4.11 and 4.12 and needs to reverted from there as well Cc: # 4.9+ Signed-off-by: Kai-Heng Feng Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 3 --- drivers/usb/host/xhci-pci.c | 12 ------------ drivers/usb/host/xhci.h | 2 +- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 4dba2cf73cbe..da9158f171cb 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1515,9 +1515,6 @@ int xhci_bus_suspend(struct usb_hcd *hcd) t2 |= PORT_WKOC_E | PORT_WKCONN_E; t2 &= ~PORT_WKDISC_E; } - if ((xhci->quirks & XHCI_U2_DISABLE_WAKE) && - (hcd->speed < HCD_USB3)) - t2 &= ~PORT_WAKE_BITS; } else t2 &= ~PORT_WAKE_BITS; diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 8071c8fdd15e..76f392954733 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -54,11 +54,6 @@ #define PCI_DEVICE_ID_INTEL_APL_XHCI 0x5aa8 #define PCI_DEVICE_ID_INTEL_DNV_XHCI 0x19d0 -#define PCI_DEVICE_ID_AMD_PROMONTORYA_4 0x43b9 -#define PCI_DEVICE_ID_AMD_PROMONTORYA_3 0x43ba -#define PCI_DEVICE_ID_AMD_PROMONTORYA_2 0x43bb -#define PCI_DEVICE_ID_AMD_PROMONTORYA_1 0x43bc - #define PCI_DEVICE_ID_ASMEDIA_1042A_XHCI 0x1142 static const char hcd_name[] = "xhci_hcd"; @@ -142,13 +137,6 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) if (pdev->vendor == PCI_VENDOR_ID_AMD) xhci->quirks |= XHCI_TRUST_TX_LENGTH; - if ((pdev->vendor == PCI_VENDOR_ID_AMD) && - ((pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_4) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_3) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_2) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_1))) - xhci->quirks |= XHCI_U2_DISABLE_WAKE; - if (pdev->vendor == PCI_VENDOR_ID_INTEL) { xhci->quirks |= XHCI_LPM_SUPPORT; xhci->quirks |= XHCI_INTEL_HOST; diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 7c87189e8110..2b48aa4f6b76 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1828,7 +1828,7 @@ struct xhci_hcd { /* For controller with a broken Port Disable implementation */ #define XHCI_BROKEN_PORT_PED (1 << 25) #define XHCI_LIMIT_ENDPOINT_INTERVAL_7 (1 << 26) -#define XHCI_U2_DISABLE_WAKE (1 << 27) +/* Reserved. It was XHCI_U2_DISABLE_WAKE */ #define XHCI_ASMEDIA_MODIFY_FLOWCONTROL (1 << 28) unsigned int num_active_eps; -- cgit From bf563b01c2895a4bfd1a29cc5abc67fe706ecffd Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:42 +0200 Subject: driver core: platform: Don't read past the end of "driver_override" buffer When printing the driver_override parameter when it is 4095 and 4094 bytes long, the printing code would access invalid memory because we need count+1 bytes for printing. Reject driver_override values of these lengths in driver_override_store(). This is in close analogy to commit 4efe874aace5 ("PCI: Don't read past the end of sysfs "driver_override" buffer") from Sasha Levin. Fixes: 3d713e0e382e ("driver core: platform: add device binding path 'driver_override'") Cc: stable@vger.kernel.org # v3.17+ Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index d1bd99271066..9045c5f3734e 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -868,7 +868,8 @@ static ssize_t driver_override_store(struct device *dev, struct platform_device *pdev = to_platform_device(dev); char *driver_override, *old, *cp; - if (count > PATH_MAX) + /* We need to keep extra room for a newline */ + if (count >= (PAGE_SIZE - 1)) return -EINVAL; driver_override = kstrndup(buf, count, GFP_KERNEL); -- cgit From 432219fd002ada95d2f45dd3a25c3f7c73f27864 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 2 Sep 2017 01:15:13 +0300 Subject: serial: sh-sci: document R8A77970 bindings R-Car V3M (R8A77970) SoC also has the R-Car gen3 compatible SCIF and HSCIF ports, so document the SoC specific bindings. Signed-off-by: Sergei Shtylyov Acked-by: Rob Herring Acked-by: Geert Uytterhoeven Acked-by: Simon Horman Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/serial/renesas,sci-serial.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt index 4fc96946f81d..cf504d0380ae 100644 --- a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt +++ b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt @@ -41,6 +41,8 @@ Required properties: - "renesas,hscif-r8a7795" for R8A7795 (R-Car H3) HSCIF compatible UART. - "renesas,scif-r8a7796" for R8A7796 (R-Car M3-W) SCIF compatible UART. - "renesas,hscif-r8a7796" for R8A7796 (R-Car M3-W) HSCIF compatible UART. + - "renesas,scif-r8a77970" for R8A77970 (R-Car V3M) SCIF compatible UART. + - "renesas,hscif-r8a77970" for R8A77970 (R-Car V3M) HSCIF compatible UART. - "renesas,scif-r8a77995" for R8A77995 (R-Car D3) SCIF compatible UART. - "renesas,hscif-r8a77995" for R8A77995 (R-Car D3) HSCIF compatible UART. - "renesas,scifa-sh73a0" for SH73A0 (SH-Mobile AG5) SCIFA compatible UART. -- cgit From 104583b504da8a3ad86d033b497cb6e58941a9a1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 12 Sep 2017 12:39:54 +0200 Subject: mxser: fix timeout calculation for low rates Paul reported, that low rates like B300 make the driver to hang in mxser_wait_until_sent. His debugging tackled the issue down to the info->timeout computation in mxser_set_baud. Obviously, ints are used there and they easily overflow with these low rates: B300 makes info->timeout to be -373. So switch all these types to unsigned as it ought to be. And use the u64 domain to perform the computation as in the worst case, we need 35 bits to store the computed value (before division). And use do_div not to break 32 bit kernels. [v2] make it actually build Signed-off-by: Jiri Slaby Cc: Paul Tested-by: Signed-off-by: Greg Kroah-Hartman --- drivers/tty/mxser.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 1c0c9553bc05..7dd38047ba23 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -246,11 +246,11 @@ struct mxser_port { unsigned char err_shadow; struct async_icount icount; /* kernel counters for 4 input interrupts */ - int timeout; + unsigned int timeout; int read_status_mask; int ignore_status_mask; - int xmit_fifo_size; + unsigned int xmit_fifo_size; int xmit_head; int xmit_tail; int xmit_cnt; @@ -572,8 +572,9 @@ static void mxser_dtr_rts(struct tty_port *port, int on) static int mxser_set_baud(struct tty_struct *tty, long newspd) { struct mxser_port *info = tty->driver_data; - int quot = 0, baud; + unsigned int quot = 0, baud; unsigned char cval; + u64 timeout; if (!info->ioaddr) return -1; @@ -594,8 +595,13 @@ static int mxser_set_baud(struct tty_struct *tty, long newspd) quot = 0; } - info->timeout = ((info->xmit_fifo_size * HZ * 10 * quot) / info->baud_base); - info->timeout += HZ / 50; /* Add .02 seconds of slop */ + /* + * worst case (128 * 1000 * 10 * 18432) needs 35 bits, so divide in the + * u64 domain + */ + timeout = (u64)info->xmit_fifo_size * HZ * 10 * quot; + do_div(timeout, info->baud_base); + info->timeout = timeout + HZ / 50; /* Add .02 seconds of slop */ if (quot) { info->MCR |= UART_MCR_DTR; -- cgit From 0e5ec4140c4a8a38c4ff7293c018eb7da69a30db Mon Sep 17 00:00:00 2001 From: Russell Enderby Date: Tue, 5 Sep 2017 12:16:47 -0500 Subject: serial: bcm63xx: fix timing issue. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue where unprintable characters can occur or output is cut off over the serial uart / linux console depending on timing. Problem occurs when changing the serial baud rate when setting up the new console.The bcm63xx driver does a disable and flush of the uart tx fifo while there is data still in the tx fifo. If the tx fifo still has data it is trying to send out, we need to wait until it is empty before disabling and flushing the uart. When we now go to change the uart parameters including speed we check if there is data currently in the tx fifo.If there is was mdelay(10) and check again.If it tries 3 times and still has data in it we just continue and sacrifice the tx fifo buffer. A cleaner and more preferred approach would be to remove : - spin_lock_irqsave() - bcm_uart_disable() - bcm_uart_flush() However it is not clear if the author put those in to fix another underlying issue.As a result this solution is a safer approach. Output before the fix: [0.306000] 14e00520.serial: ttyS0 at MMIO 0x14e00520 (irq = 9, base_baud = 1687500) is a° 0.315000] console[ttyS0] enabled Output verified after the fix: [0.315000] 14e00520.serial: ttyS0 at MMIO 0x14e00520 (irq = 9, base_baud = 1687500) is a bcm63xx_uart [0.334000] console[ttyS0] enabled Signed-off-by: Russell Enderby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/bcm63xx_uart.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/tty/serial/bcm63xx_uart.c b/drivers/tty/serial/bcm63xx_uart.c index 583c9a0c7ecc..8c48c3784831 100644 --- a/drivers/tty/serial/bcm63xx_uart.c +++ b/drivers/tty/serial/bcm63xx_uart.c @@ -507,9 +507,14 @@ static void bcm_uart_set_termios(struct uart_port *port, { unsigned int ctl, baud, quot, ier; unsigned long flags; + int tries; spin_lock_irqsave(&port->lock, flags); + /* Drain the hot tub fully before we power it off for the winter. */ + for (tries = 3; !bcm_uart_tx_empty(port) && tries; tries--) + mdelay(10); + /* disable uart while changing speed */ bcm_uart_disable(port); bcm_uart_flush(port); -- cgit From 9d7ee0e28da59b05647c3d2a7ad4076c16b1a6ef Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 4 Sep 2017 19:20:24 +0800 Subject: tty: serial: lpuart: avoid report NULL interrupt The current driver register irq in .startup() and free the irq in .shutdown(), then user will see the NULL interrupt output from 'cat /proc/interrupts' after the uart port test completed: ... 41: 515 0 0 0 GICv3 257 Level fsl-lpuart 42: 2 0 0 0 GICv3 258 Level ... It is better to register all the irqs during probe function via devm_request_irq() to avoid to call free_irq(). Signed-off-by: Fugang Duan Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/fsl_lpuart.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 849c1f9991ce..f0252184291e 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -1276,7 +1276,6 @@ static void rx_dma_timer_init(struct lpuart_port *sport) static int lpuart_startup(struct uart_port *port) { struct lpuart_port *sport = container_of(port, struct lpuart_port, port); - int ret; unsigned long flags; unsigned char temp; @@ -1291,11 +1290,6 @@ static int lpuart_startup(struct uart_port *port) sport->rxfifo_size = 0x1 << (((temp >> UARTPFIFO_RXSIZE_OFF) & UARTPFIFO_FIFOSIZE_MASK) + 1); - ret = devm_request_irq(port->dev, port->irq, lpuart_int, 0, - DRIVER_NAME, sport); - if (ret) - return ret; - spin_lock_irqsave(&sport->port.lock, flags); lpuart_setup_watermark(sport); @@ -1333,7 +1327,6 @@ static int lpuart_startup(struct uart_port *port) static int lpuart32_startup(struct uart_port *port) { struct lpuart_port *sport = container_of(port, struct lpuart_port, port); - int ret; unsigned long flags; unsigned long temp; @@ -1346,11 +1339,6 @@ static int lpuart32_startup(struct uart_port *port) sport->rxfifo_size = 0x1 << (((temp >> UARTFIFO_RXSIZE_OFF) & UARTFIFO_FIFOSIZE_MASK) - 1); - ret = devm_request_irq(port->dev, port->irq, lpuart32_int, 0, - DRIVER_NAME, sport); - if (ret) - return ret; - spin_lock_irqsave(&sport->port.lock, flags); lpuart32_setup_watermark(sport); @@ -1380,8 +1368,6 @@ static void lpuart_shutdown(struct uart_port *port) spin_unlock_irqrestore(&port->lock, flags); - devm_free_irq(port->dev, port->irq, sport); - if (sport->lpuart_dma_rx_use) { del_timer_sync(&sport->lpuart_timer); lpuart_dma_rx_free(&sport->port); @@ -1400,7 +1386,6 @@ static void lpuart_shutdown(struct uart_port *port) static void lpuart32_shutdown(struct uart_port *port) { - struct lpuart_port *sport = container_of(port, struct lpuart_port, port); unsigned long temp; unsigned long flags; @@ -1413,8 +1398,6 @@ static void lpuart32_shutdown(struct uart_port *port) lpuart32_write(port, temp, UARTCTRL); spin_unlock_irqrestore(&port->lock, flags); - - devm_free_irq(port->dev, port->irq, sport); } static void @@ -2212,16 +2195,22 @@ static int lpuart_probe(struct platform_device *pdev) platform_set_drvdata(pdev, &sport->port); - if (lpuart_is_32(sport)) + if (lpuart_is_32(sport)) { lpuart_reg.cons = LPUART32_CONSOLE; - else + ret = devm_request_irq(&pdev->dev, sport->port.irq, lpuart32_int, 0, + DRIVER_NAME, sport); + } else { lpuart_reg.cons = LPUART_CONSOLE; + ret = devm_request_irq(&pdev->dev, sport->port.irq, lpuart_int, 0, + DRIVER_NAME, sport); + } + + if (ret) + goto failed_irq_request; ret = uart_add_one_port(&lpuart_reg, &sport->port); - if (ret) { - clk_disable_unprepare(sport->clk); - return ret; - } + if (ret) + goto failed_attach_port; sport->dma_tx_chan = dma_request_slave_channel(sport->port.dev, "tx"); if (!sport->dma_tx_chan) @@ -2240,6 +2229,11 @@ static int lpuart_probe(struct platform_device *pdev) } return 0; + +failed_attach_port: +failed_irq_request: + clk_disable_unprepare(sport->clk); + return ret; } static int lpuart_remove(struct platform_device *pdev) -- cgit From c91261437985d481c472639d4397931d77f5d4e9 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 2 Sep 2017 23:13:55 +0300 Subject: serial: sccnxp: Fix error handling in sccnxp_probe() sccnxp_probe() returns result of regulator_disable() that may lead to returning zero, while device is not properly initialized. Also the driver enables clocks, but it does not disable it. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sccnxp.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/sccnxp.c b/drivers/tty/serial/sccnxp.c index cdd2f942317c..b9c7a904c1ea 100644 --- a/drivers/tty/serial/sccnxp.c +++ b/drivers/tty/serial/sccnxp.c @@ -889,7 +889,16 @@ static int sccnxp_probe(struct platform_device *pdev) goto err_out; uartclk = 0; } else { - clk_prepare_enable(clk); + ret = clk_prepare_enable(clk); + if (ret) + goto err_out; + + ret = devm_add_action_or_reset(&pdev->dev, + (void(*)(void *))clk_disable_unprepare, + clk); + if (ret) + goto err_out; + uartclk = clk_get_rate(clk); } @@ -988,7 +997,7 @@ static int sccnxp_probe(struct platform_device *pdev) uart_unregister_driver(&s->uart); err_out: if (!IS_ERR(s->regulator)) - return regulator_disable(s->regulator); + regulator_disable(s->regulator); return ret; } -- cgit From 64cfcaed7b25f69d8b7a091a23961f50c6788a66 Mon Sep 17 00:00:00 2001 From: Daniel Díaz Date: Fri, 7 Jul 2017 10:27:06 -0500 Subject: selftests: net: More graceful finding of `ip'. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ip tool might be provided by another package (such as Busybox), not necessarily implementing the -Version switch. Trying an actual usage (`ip link show') might be a better test that would work with all implementations of `ip'. Signed-off-by: Daniel Díaz Signed-off-by: Shuah Khan --- tools/testing/selftests/net/netdevice.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index 4e00568d70c2..90cb903c3381 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -178,7 +178,7 @@ if [ "$(id -u)" -ne 0 ];then exit 0 fi -ip -Version 2>/dev/null >/dev/null +ip link show 2>/dev/null >/dev/null if [ $? -ne 0 ];then echo "SKIP: Could not run test without the ip tool" exit 0 -- cgit From 96e5335859e30fa2bb2da8befc86292c3c1a73dd Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 11 Sep 2017 12:49:01 +0200 Subject: tools: fix testing/selftests/sigaltstack for s390x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On s390x the compilation of the file sas.c in directory tools/testing/selftests/sigaltstack fails with this error message: root@s35lp76 testing]# make selftests/sigaltstack/sas cc selftests/sigaltstack/sas.c -o selftests/sigaltstack/sas selftests/sigaltstack/sas.c: In function ‘my_usr1’: selftests/sigaltstack/sas.c:42:25: error: invalid register name for ‘sp’ register unsigned long sp asm("sp"); ^~ : recipe for target 'selftests/sigaltstack/sas' failed make: *** [selftests/sigaltstack/sas] Error 1 [root@s35lp76 testing]# On s390x the stack pointer is register r15, the register name "sp" is unknown. Make this line platform dependend and use register r15. With this patch the compilation and test succeeds: [root@s35lp76 testing]# ./selftests/sigaltstack/sas TAP version 13 ok 1 Initial sigaltstack state was SS_DISABLE # [RUN] signal USR1 ok 2 sigaltstack is disabled in sighandler # [RUN] switched to user ctx # [RUN] signal USR2 # [OK] Stack preserved ok 3 sigaltstack is still SS_AUTODISARM after signal Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 1..3 [root@s35lp76 testing]# Signed-off-by: Thomas Richter Signed-off-by: Shuah Khan --- tools/testing/selftests/sigaltstack/sas.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c index 7d406c3973ba..97bb150837df 100644 --- a/tools/testing/selftests/sigaltstack/sas.c +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -39,7 +39,11 @@ void my_usr1(int sig, siginfo_t *si, void *u) stack_t stk; struct stk_data *p; +#if __s390x__ + register unsigned long sp asm("%15"); +#else register unsigned long sp asm("sp"); +#endif if (sp < (unsigned long)sstack || sp >= (unsigned long)sstack + SIGSTKSZ) { -- cgit From 172a8ca880f1c8b01bc4d1e0239bcb293ef65e0b Mon Sep 17 00:00:00 2001 From: Fathi Boudra Date: Thu, 29 Jun 2017 12:39:53 +0300 Subject: selftests: breakpoints: re-order TEST_GEN_PROGS targets breakpoint_test can fail on arm64 with older/unpatched glibc: breakpoint_test_arm64.c: In function 'run_test': breakpoint_test_arm64.c:170:25: error: 'TRAP_HWBKPT' undeclared (first use in this function) due to glibc missing several of the TRAP_* constants in the userspace definitions. Specifically TRAP_BRANCH and TRAP_HWBKPT. See https://sourceware.org/bugzilla/show_bug.cgi?id=21286 It prevents to build step_after_suspend_test afterward, since make won't continue. We still want to be able to build and run the test, independently of breakpoint_test_arm64 build failure. Re-order TEST_GEN_PROGS to be able to build step_after_suspend_test first. Signed-off-by: Fathi Boudra Signed-off-by: Shuah Khan --- tools/testing/selftests/breakpoints/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile index 6b214b7b10fb..247b0a1899d7 100644 --- a/tools/testing/selftests/breakpoints/Makefile +++ b/tools/testing/selftests/breakpoints/Makefile @@ -2,14 +2,14 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +TEST_GEN_PROGS := step_after_suspend_test + ifeq ($(ARCH),x86) -TEST_GEN_PROGS := breakpoint_test +TEST_GEN_PROGS += breakpoint_test endif ifneq (,$(filter $(ARCH),aarch64 arm64)) -TEST_GEN_PROGS := breakpoint_test_arm64 +TEST_GEN_PROGS += breakpoint_test_arm64 endif -TEST_GEN_PROGS += step_after_suspend_test - include ../lib.mk -- cgit From 67b2e30eb7b346db60afe91b0927738fb604d0a4 Mon Sep 17 00:00:00 2001 From: Daniel Díaz Date: Thu, 17 Aug 2017 10:55:30 -0500 Subject: selftests: intel_pstate: build only on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These tests are only for x86, so don't try to build or run them on other architectures. Signed-off-by: Daniel Díaz Signed-off-by: Shuah Khan --- tools/testing/selftests/intel_pstate/Makefile | 2 ++ tools/testing/selftests/intel_pstate/run.sh | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/intel_pstate/Makefile b/tools/testing/selftests/intel_pstate/Makefile index 849a90ffe8dd..a97e24edde39 100644 --- a/tools/testing/selftests/intel_pstate/Makefile +++ b/tools/testing/selftests/intel_pstate/Makefile @@ -1,7 +1,9 @@ CFLAGS := $(CFLAGS) -Wall -D_GNU_SOURCE LDLIBS := $(LDLIBS) -lm +ifeq (,$(filter $(ARCH),x86)) TEST_GEN_FILES := msr aperf +endif TEST_PROGS := run.sh diff --git a/tools/testing/selftests/intel_pstate/run.sh b/tools/testing/selftests/intel_pstate/run.sh index 7868c106b8b1..1b4b8302dfc2 100755 --- a/tools/testing/selftests/intel_pstate/run.sh +++ b/tools/testing/selftests/intel_pstate/run.sh @@ -29,6 +29,11 @@ EVALUATE_ONLY=0 +if ! uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ | grep -q x86; then + echo "$0 # Skipped: Test can only run on x86 architectures." + exit 0 +fi + max_cpus=$(($(nproc)-1)) # compile programs -- cgit From 6f0003363a13be699ba945cfa4074193e04fbea5 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:18 +0200 Subject: selftests/intel_pstate: No need to compile test progs in the run script Both test programs are being compiled by make, so no need to compile both programs in the runner script. This resolves an error when installing all selftests via make install and run them in a different environemnt. Running tests in intel_pstate ======================================== ./run.sh: line 35: gcc: command not found Problem compiling aperf.c. Signed-off-by: Thomas Meyer Signed-off-by: Shuah Khan --- tools/testing/selftests/intel_pstate/run.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/testing/selftests/intel_pstate/run.sh b/tools/testing/selftests/intel_pstate/run.sh index 1b4b8302dfc2..d3ab48f91cd6 100755 --- a/tools/testing/selftests/intel_pstate/run.sh +++ b/tools/testing/selftests/intel_pstate/run.sh @@ -36,12 +36,6 @@ fi max_cpus=$(($(nproc)-1)) -# compile programs -gcc aperf.c -Wall -D_GNU_SOURCE -o aperf -lm -[ $? -ne 0 ] && echo "Problem compiling aperf.c." && exit 1 -gcc -o msr msr.c -lm -[ $? -ne 0 ] && echo "Problem compiling msr.c." && exit 1 - function run_test () { file_ext=$1 -- cgit From 56a268cd4a410081d477913d51e044039d3a8834 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 13:19:23 +0200 Subject: selftests/bpf: Make bpf_util work on uniprocessor systems The current implementation fails to work on uniprocessor systems. Fix the parser to also handle the uniprocessor case. Signed-off-by: Thomas Meyer Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Shuah Khan --- tools/testing/selftests/bpf/bpf_util.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 20ecbaa0d85d..6c53a8906eff 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -12,6 +12,7 @@ static inline unsigned int bpf_num_possible_cpus(void) unsigned int start, end, possible_cpus = 0; char buff[128]; FILE *fp; + int n; fp = fopen(fcpu, "r"); if (!fp) { @@ -20,17 +21,17 @@ static inline unsigned int bpf_num_possible_cpus(void) } while (fgets(buff, sizeof(buff), fp)) { - if (sscanf(buff, "%u-%u", &start, &end) == 2) { - possible_cpus = start == 0 ? end + 1 : 0; - break; + n = sscanf(buff, "%u-%u", &start, &end); + if (n == 0) { + printf("Failed to retrieve # possible CPUs!\n"); + exit(1); + } else if (n == 1) { + end = start; } + possible_cpus = start == 0 ? end + 1 : 0; + break; } - fclose(fp); - if (!possible_cpus) { - printf("Failed to retrieve # possible CPUs!\n"); - exit(1); - } return possible_cpus; } -- cgit From 84c06566cfb84e5f6aed9582d4570df8406700c9 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:17 +0200 Subject: selftests/ftrace: multiple_kprobes: Also check for support The multiple_kprobes test case fails to check for KPROBE_EVENT support. Add the check to prevent a false test result. Signed-off-by: Thomas Meyer Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc index 2a1cb9908746..a4fd4c851a5b 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc @@ -1,6 +1,8 @@ #!/bin/sh # description: Register/unregister many kprobe events +[ -f kprobe_events ] || exit_unsupported # this is configurable + # ftrace fentry skip size depends on the machine architecture. # Currently HAVE_KPROBES_ON_FTRACE defined on x86 and powerpc64le case `uname -m` in -- cgit From 47684e111f52fface17820d3ef84cc845b26070e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Sep 2017 13:10:15 -0700 Subject: Documentation: core-api: minor workqueue.rst cleanups Clean up workqueue.rst: - fix minor typos - put '@' after `` instead of preceding them (one place) - use "CPU" instead of "cpu" in text consistently - quote one function name Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Florian Mickler Signed-off-by: Tejun Heo --- Documentation/core-api/workqueue.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 3943b5bfa8cf..00a5ba51e63f 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -39,8 +39,8 @@ up. Although MT wq wasted a lot of resource, the level of concurrency provided was unsatisfactory. The limitation was common to both ST and MT wq albeit less severe on MT. Each wq maintained its own separate -worker pool. A MT wq could provide only one execution context per CPU -while a ST wq one for the whole system. Work items had to compete for +worker pool. An MT wq could provide only one execution context per CPU +while an ST wq one for the whole system. Work items had to compete for those very limited execution contexts leading to various problems including proneness to deadlocks around the single execution context. @@ -151,7 +151,7 @@ Application Programming Interface (API) ``alloc_workqueue()`` allocates a wq. The original ``create_*workqueue()`` functions are deprecated and scheduled for -removal. ``alloc_workqueue()`` takes three arguments - @``name``, +removal. ``alloc_workqueue()`` takes three arguments - ``@name``, ``@flags`` and ``@max_active``. ``@name`` is the name of the wq and also used as the name of the rescuer thread if there is one. @@ -197,7 +197,7 @@ resources, scheduled and executed. served by worker threads with elevated nice level. Note that normal and highpri worker-pools don't interact with - each other. Each maintain its separate pool of workers and + each other. Each maintains its separate pool of workers and implements concurrency management among its workers. ``WQ_CPU_INTENSIVE`` @@ -249,8 +249,8 @@ unbound worker-pools and only one work item could be active at any given time thus achieving the same ordering property as ST wq. In the current implementation the above configuration only guarantees -ST behavior within a given NUMA node. Instead alloc_ordered_queue should -be used to achieve system wide ST behavior. +ST behavior within a given NUMA node. Instead ``alloc_ordered_queue()`` should +be used to achieve system-wide ST behavior. Example Execution Scenarios -- cgit From 80a921e207cea3dd18038501187078669a218dab Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 18 Sep 2017 23:00:08 +0300 Subject: ata_piix: Add Fujitsu-Siemens Lifebook S6120 to short cable IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fujitsu-Siemens Lifebook S6120 misdetects the cable type for some drives. The problematic one in this case is an mSATA SSD hooked up via a mSATA->PATA bridge. With regular hard disks the detection seems to work correctly. Strangely an older Lifebook model (S6020) detects the cable as 80c with the mSATA SSD, even if using the exact same flex cable. Cc: Tejun Heo Signed-off-by: Ville Syrjälä Signed-off-by: Tejun Heo --- drivers/ata/ata_piix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index 8401c3b5be92..b702c20fbc2b 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -492,6 +492,7 @@ static const struct ich_laptop ich_laptop[] = { { 0x27DF, 0x152D, 0x0778 }, /* ICH7 on unknown Intel */ { 0x24CA, 0x1025, 0x0061 }, /* ICH4 on ACER Aspire 2023WLMi */ { 0x24CA, 0x1025, 0x003d }, /* ICH4 on ACER TM290 */ + { 0x24CA, 0x10CF, 0x11AB }, /* ICH4M on Fujitsu-Siemens Lifebook S6120 */ { 0x266F, 0x1025, 0x0066 }, /* ICH6 on ACER Aspire 1694WLMi */ { 0x2653, 0x1043, 0x82D8 }, /* ICH6M on Asus Eee 701 */ { 0x27df, 0x104d, 0x900e }, /* ICH7 on Sony TZ-90 */ -- cgit From 55e001aabb826c96f09e0a440bdcbce620189dbc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 12:17:36 +0200 Subject: fpga: altera-cvp: remove DRIVER_ATTR() usage It's better to be explicit and use the DRIVER_ATTR_RW() macro when defining a driver's sysfs file. This is part of a series to drop DRIVER_ATTR() from the tree entirely. Cc: linux-fpga@vger.kernel.org Reviewed-by: Moritz Fischer Acked-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- drivers/fpga/altera-cvp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/fpga/altera-cvp.c b/drivers/fpga/altera-cvp.c index 08629ee69d11..00e73d28077c 100644 --- a/drivers/fpga/altera-cvp.c +++ b/drivers/fpga/altera-cvp.c @@ -361,12 +361,12 @@ static const struct fpga_manager_ops altera_cvp_ops = { .write_complete = altera_cvp_write_complete, }; -static ssize_t show_chkcfg(struct device_driver *dev, char *buf) +static ssize_t chkcfg_show(struct device_driver *dev, char *buf) { return snprintf(buf, 3, "%d\n", altera_cvp_chkcfg); } -static ssize_t store_chkcfg(struct device_driver *drv, const char *buf, +static ssize_t chkcfg_store(struct device_driver *drv, const char *buf, size_t count) { int ret; @@ -378,7 +378,7 @@ static ssize_t store_chkcfg(struct device_driver *drv, const char *buf, return count; } -static DRIVER_ATTR(chkcfg, 0600, show_chkcfg, store_chkcfg); +static DRIVER_ATTR_RW(chkcfg); static int altera_cvp_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id); -- cgit From 850fdec8d2fd1eebfa003fea39bec08cd69b6155 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 12:17:57 +0200 Subject: driver core: remove DRIVER_ATTR DRIVER_ATTR is no longer in use, and driver authors should be using DRIVER_ATTR_RW() or DRIVER_ATTR_RO() or DRIVER_ATTR_WO() instead in order to always get the permissions correct. So remove it so that no one can use it anymore. Acked-by: Alan Tull Reviewed-by: Moritz Fischer Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-model/driver.txt | 7 ++++--- Documentation/filesystems/sysfs.txt | 3 ++- include/linux/device.h | 2 -- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt index 4421135826a2..d661e6f7e6a0 100644 --- a/Documentation/driver-model/driver.txt +++ b/Documentation/driver-model/driver.txt @@ -196,12 +196,13 @@ struct driver_attribute { }; Device drivers can export attributes via their sysfs directories. -Drivers can declare attributes using a DRIVER_ATTR macro that works -identically to the DEVICE_ATTR macro. +Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO +macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO +macros. Example: -DRIVER_ATTR(debug,0644,show_debug,store_debug); +DRIVER_ATTR_RW(debug); This is equivalent to declaring: diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 24da7b32c489..9a3658cc399e 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -366,7 +366,8 @@ struct driver_attribute { Declaring: -DRIVER_ATTR(_name, _mode, _show, _store) +DRIVER_ATTR_RO(_name) +DRIVER_ATTR_RW(_name) Creation/Removal: diff --git a/include/linux/device.h b/include/linux/device.h index c6f27207dbe8..2bc70ddda09b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -307,8 +307,6 @@ struct driver_attribute { size_t count); }; -#define DRIVER_ATTR(_name, _mode, _show, _store) \ - struct driver_attribute driver_attr_##_name = __ATTR(_name, _mode, _show, _store) #define DRIVER_ATTR_RW(_name) \ struct driver_attribute driver_attr_##_name = __ATTR_RW(_name) #define DRIVER_ATTR_RO(_name) \ -- cgit From d03d5d530a46c9def981a2a4ec98f6ec2e9b7878 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 8 Jul 2017 20:16:29 +0100 Subject: MAINTAINERS: add Macchiatobin maintainers entry Add a maintainers entry for the Macchiatobin board. Signed-off-by: Russell King Signed-off-by: Gregory CLEMENT --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..531ba1acae3e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8253,6 +8253,12 @@ L: libertas-dev@lists.infradead.org S: Orphan F: drivers/net/wireless/marvell/libertas/ +MARVELL MACCHIATOBIN SUPPORT +M: Russell King +L: linux-arm-kernel@lists.infradead.org +S: Maintained +F: arch/arm64/boot/dts/marvell/armada-8040-mcbin.dts + MARVELL MV643XX ETHERNET DRIVER M: Sebastian Hesselbarth L: netdev@vger.kernel.org -- cgit From 9e7460fc325dad06d2066abdbc1f4dd49456f9a4 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Fri, 15 Sep 2017 10:50:07 +0300 Subject: arm64: dt marvell: Fix AP806 system controller size Extend the container size to 0x2000 to include the gpio controller at offset 0x1040. While at it, add start address notation to the gpio node name to match its 'offset' property. Fixes: 63dac0f4924b ("arm64: dts: marvell: add gpio support for Armada 7K/8K") Cc: Signed-off-by: Baruch Siach Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/armada-ap806.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi index 4d360713ed12..30d48ecf46e0 100644 --- a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi @@ -254,7 +254,7 @@ ap_syscon: system-controller@6f4000 { compatible = "syscon", "simple-mfd"; - reg = <0x6f4000 0x1000>; + reg = <0x6f4000 0x2000>; ap_clk: clock { compatible = "marvell,ap806-clock"; @@ -265,7 +265,7 @@ compatible = "marvell,ap806-pinctrl"; }; - ap_gpio: gpio { + ap_gpio: gpio@1040 { compatible = "marvell,armada-8k-gpio"; offset = <0x1040>; ngpios = <20>; -- cgit From bd7a3fe770ebd8391d1c7d072ff88e9e76d063eb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Sep 2017 15:07:17 +0200 Subject: USB: fix out-of-bounds in usb_set_configuration Andrey Konovalov reported a possible out-of-bounds problem for a USB interface association descriptor. He writes: It seems there's no proper size check of a USB_DT_INTERFACE_ASSOCIATION descriptor. It's only checked that the size is >= 2 in usb_parse_configuration(), so find_iad() might do out-of-bounds access to intf_assoc->bInterfaceCount. And he's right, we don't check for crazy descriptors of this type very well, so resolve this problem. Yet another issue found by syzkaller... Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 14 +++++++++++--- include/uapi/linux/usb/ch9.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 854c8d66cfbe..68b54bd88d1e 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -643,15 +643,23 @@ static int usb_parse_configuration(struct usb_device *dev, int cfgidx, } else if (header->bDescriptorType == USB_DT_INTERFACE_ASSOCIATION) { + struct usb_interface_assoc_descriptor *d; + + d = (struct usb_interface_assoc_descriptor *)header; + if (d->bLength < USB_DT_INTERFACE_ASSOCIATION_SIZE) { + dev_warn(ddev, + "config %d has an invalid interface association descriptor of length %d, skipping\n", + cfgno, d->bLength); + continue; + } + if (iad_num == USB_MAXIADS) { dev_warn(ddev, "found more Interface " "Association Descriptors " "than allocated for in " "configuration %d\n", cfgno); } else { - config->intf_assoc[iad_num] = - (struct usb_interface_assoc_descriptor - *)header; + config->intf_assoc[iad_num] = d; iad_num++; } diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index ce1169af39d7..2a5d63040a0b 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -780,6 +780,7 @@ struct usb_interface_assoc_descriptor { __u8 iFunction; } __attribute__ ((packed)); +#define USB_DT_INTERFACE_ASSOCIATION_SIZE 8 /*-------------------------------------------------------------------------*/ -- cgit From bb4e6ff01ac356f82327d980e45fee8a65491328 Mon Sep 17 00:00:00 2001 From: Nickey Yang Date: Mon, 18 Sep 2017 17:05:37 +0800 Subject: arm64: dts: rockchip: Correct MIPI DPHY PLL clock on rk3399 There is a further gate in between the mipidphy reference clock and the actual ref-clock input to the dsi host, making the clock hirarchy look like clk_24m --> Gate11[14] --> clk_mipidphy_ref --> Gate21[0] --> clk_dphy_pll Fix the clock reference so that the whole clock subtree gets enabled when the dsi host needs it. Signed-off-by: Nickey Yang [amended commit message] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index d79e9b3265b9..6aa43fd47148 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1629,7 +1629,7 @@ compatible = "rockchip,rk3399-mipi-dsi", "snps,dw-mipi-dsi"; reg = <0x0 0xff960000 0x0 0x8000>; interrupts = ; - clocks = <&cru SCLK_MIPIDPHY_REF>, <&cru PCLK_MIPI_DSI0>, + clocks = <&cru SCLK_DPHY_PLL>, <&cru PCLK_MIPI_DSI0>, <&cru SCLK_DPHY_TX0_CFG>; clock-names = "ref", "pclk", "phy_cfg"; power-domains = <&power RK3399_PD_VIO>; -- cgit From 4ae7c364b9320063504db78834fabe59d16f85bf Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 28 Aug 2017 15:18:16 +0200 Subject: ARM: dts: nokia n900: drop unneeded/undocumented parts of the dts Sakari mentioned that some parts of the dts are not needed and do not have proper documentation, yet. As the camera works without them, remove them for now. Signed-off-by: Pavel Machek Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3-n900.dts | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts index 26c20e1167b9..4acd32a1c4ef 100644 --- a/arch/arm/boot/dts/omap3-n900.dts +++ b/arch/arm/boot/dts/omap3-n900.dts @@ -144,15 +144,6 @@ io-channel-names = "temp", "bsi", "vbat"; }; - rear_camera: camera@0 { - compatible = "linux,camera"; - - module { - model = "TCM8341MD"; - sensor = <&cam1>; - }; - }; - pwm9: dmtimer-pwm { compatible = "ti,omap-dmtimer-pwm"; #pwm-cells = <3>; @@ -189,10 +180,8 @@ clock-lanes = <1>; data-lanes = <0>; lane-polarity = <0 0>; - clock-inv = <0>; /* Select strobe = <1> for back camera, <0> for front camera */ strobe = <1>; - crc = <0>; }; }; }; -- cgit From 06480f8cf559001c9eb49b4e9d822e13ad1cc5c4 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 6 Sep 2017 16:03:58 +0530 Subject: ARM: OMAP2+: dra7xx: Set OPT_CLKS_IN_RESET flag for gpio1 gpio1 soft reset fails in the kexec path as the optional clock is not enabled hence enable the HWMOD_CONTROL_OPT_CLKS_IN_RESET flag for gpio1 hwmod. Signed-off-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_hwmod_7xx_data.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c index f040244c57e7..2f4f7002f38d 100644 --- a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c @@ -839,6 +839,7 @@ static struct omap_hwmod dra7xx_gpio1_hwmod = { .name = "gpio1", .class = &dra7xx_gpio_hwmod_class, .clkdm_name = "wkupaon_clkdm", + .flags = HWMOD_CONTROL_OPT_CLKS_IN_RESET, .main_clk = "wkupaon_iclk_mux", .prcm = { .omap4 = { -- cgit From 8aed1026ccfe9cf5772c62bde35bc101ead9308c Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 6 Sep 2017 19:09:32 +0530 Subject: ARM: dts: dra7: Set a default parent to mcasp3_ahclkx_mux Assign a default parent to mcasp3_ahclkx_mux clock using the assigned-clock-parents property. This is helpful in cases like kexec where in the clock parent can be something other than the value at reset. Suggested-by: Tero Kristo Signed-off-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/dra7xx-clocks.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/dra7xx-clocks.dtsi b/arch/arm/boot/dts/dra7xx-clocks.dtsi index cf229dfabf61..e62b62875cba 100644 --- a/arch/arm/boot/dts/dra7xx-clocks.dtsi +++ b/arch/arm/boot/dts/dra7xx-clocks.dtsi @@ -1817,6 +1817,8 @@ clocks = <&abe_24m_fclk>, <&abe_sys_clk_div>, <&func_24m_clk>, <&atl_clkin3_ck>, <&atl_clkin2_ck>, <&atl_clkin1_ck>, <&atl_clkin0_ck>, <&sys_clkin2>, <&ref_clkin0_ck>, <&ref_clkin1_ck>, <&ref_clkin2_ck>, <&ref_clkin3_ck>, <&mlb_clk>, <&mlbp_clk>; ti,bit-shift = <24>; reg = <0x1868>; + assigned-clocks = <&mcasp3_ahclkx_mux>; + assigned-clock-parents = <&abe_24m_fclk>; }; mcasp3_aux_gfclk_mux: mcasp3_aux_gfclk_mux@1868 { -- cgit From 20547dfd85f5baaf27ca01b32570bd6bfd7b209c Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Fri, 8 Sep 2017 21:15:51 +0200 Subject: ARM: OMAP2+: hsmmc: fix logic to call either omap_hsmmc_init or omap_hsmmc_late_init but not both MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With 4.13 kernel I get this boot message: [    1.051727] ------------[ cut here ]------------ [    1.051818] WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x54/0x74 [    1.051849] sysfs: cannot create duplicate filename '/devices/platform/omap_hsmmc.2' [    1.051879] Modules linked in: [    1.051971] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-letux+ #1360 [    1.052001] Hardware name: Generic OMAP3 (Flattened Device Tree) [    1.052062] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [    1.052124] [] (show_stack) from [] (dump_stack+0x98/0xd0) [    1.052185] [] (dump_stack) from [] (__warn+0xd0/0x100) [    1.052215] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [    1.052276] [] (warn_slowpath_fmt) from [] (sysfs_warn_dup+0x54/0x74) [    1.052337] [] (sysfs_warn_dup) from [] (sysfs_create_dir_ns+0x74/0x84) [    1.052398] [] (sysfs_create_dir_ns) from [] (kobject_add_internal+0xd0/0x294) [    1.052429] [] (kobject_add_internal) from [] (kobject_add+0x6c/0x8c) [    1.052490] [] (kobject_add) from [] (device_add+0xe4/0x510) [    1.052551] [] (device_add) from [] (platform_device_add+0x130/0x1c0) [    1.052612] [] (platform_device_add) from [] (omap_hsmmc_late_init+0x3c/0x60) [    1.052673] [] (omap_hsmmc_late_init) from [] (omap3_pandora_legacy_init+0x24/0xb4) [    1.052734] [] (omap3_pandora_legacy_init) from [] (pdata_quirks_check+0x30/0x3c) [    1.052795] [] (pdata_quirks_check) from [] (omap_generic_init+0xc/0x18) [    1.052856] [] (omap_generic_init) from [] (customize_machine+0x1c/0x28) [    1.052917] [] (customize_machine) from [] (do_one_initcall+0xa8/0x150) [    1.052947] [] (do_one_initcall) from [] (kernel_init_freeable+0x110/0x1d4) [    1.053009] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x10c) [    1.053070] [] (kernel_init) from [] (ret_from_fork+0x14/0x24) [    1.055023] ---[ end trace 44e490b09ac4ab88 ]--- This can be traced down to the calls of omap_hsmmc_init(pandora_mmc3); omap_hsmmc_late_init(pandora_mmc3); in omap3_pandora_legacy_init(). It turns out that both funcions disagree how to decide if the other one was alredy called. Signed-off-by: H. Nikolaus Schaller Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/hsmmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-omap2/hsmmc.c b/arch/arm/mach-omap2/hsmmc.c index 5b614388d72f..6d28aa20a7d3 100644 --- a/arch/arm/mach-omap2/hsmmc.c +++ b/arch/arm/mach-omap2/hsmmc.c @@ -58,10 +58,10 @@ void omap_hsmmc_late_init(struct omap2_hsmmc_info *c) struct platform_device *pdev; int res; - if (omap_hsmmc_done != 1) + if (omap_hsmmc_done) return; - omap_hsmmc_done++; + omap_hsmmc_done = 1; for (; c->mmc; c++) { pdev = c->pdev; -- cgit From cddfae253c875076750a03bd05ba5b1569e6876e Mon Sep 17 00:00:00 2001 From: Suniel Mahesh Date: Mon, 11 Sep 2017 12:00:16 +0530 Subject: ARM: dts: am33xx: Add spi alias to match SOC schematics Linux bus numbers should match the numbers defined by the chip manufacturer. This patch add's spi aliases to achieve that bus naming convention. Signed-off-by: Suniel Mahesh Signed-off-by: Karthik Tummala Tested-by: Karthik Tummala Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am33xx.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index 7d7ca054c557..e58fab8aec5d 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -36,6 +36,8 @@ phy1 = &usb1_phy; ethernet0 = &cpsw_emac0; ethernet1 = &cpsw_emac1; + spi0 = &spi0; + spi1 = &spi1; }; cpus { -- cgit From 4afa616ce937f88d9a69a71b8c561551596a81e3 Mon Sep 17 00:00:00 2001 From: Yogesh Siraswar Date: Thu, 14 Sep 2017 14:30:07 -0500 Subject: ARM: dts: am43xx-epos-evm: Remove extra CPSW EMAC entry On am438x EPOS boards there is only one ethernet port, remove extra port definition. This boot log warnings during PHY detection. Signed-off-by: Yogesh Siraswar Signed-off-by: Andrew F. Davis Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am43x-epos-evm.dts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/am43x-epos-evm.dts b/arch/arm/boot/dts/am43x-epos-evm.dts index 9d276af7c539..081fa68b6f98 100644 --- a/arch/arm/boot/dts/am43x-epos-evm.dts +++ b/arch/arm/boot/dts/am43x-epos-evm.dts @@ -388,6 +388,7 @@ pinctrl-0 = <&cpsw_default>; pinctrl-1 = <&cpsw_sleep>; status = "okay"; + slaves = <1>; }; &davinci_mdio { @@ -402,11 +403,6 @@ phy-mode = "rmii"; }; -&cpsw_emac1 { - phy_id = <&davinci_mdio>, <1>; - phy-mode = "rmii"; -}; - &phy_sel { rmii-clock-ext; }; -- cgit From e1af344df4e5c8fe90f4a63235a68d5405afc41b Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 01:21:56 -0400 Subject: ALSA: asihpi: fix a potential double-fetch bug when copying puhm The hm->h.size is intended to hold the actual size of the hm struct that is copied from userspace and should always be <= sizeof(*hm). However, after copy_from_user(hm, puhm, hm->h.size), since userspace process has full control over the memory region pointed by puhm, it is possible that the value of hm->h.size is different from what is fetched-in previously (get_user(hm->h.size, (u16 __user *)puhm)). In other words, hm->h.size is overriden and the relation between hm->h.size and the hm struct is broken. This patch proposes to use a seperate variable, msg_size, to hold the value of the first fetch and override hm->h.size to msg_size after the second fetch to maintain the relation. Signed-off-by: Meng Xu Signed-off-by: Takashi Iwai --- sound/pci/asihpi/hpioctl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sound/pci/asihpi/hpioctl.c b/sound/pci/asihpi/hpioctl.c index 7e3aa50b21f9..5badd08e1d69 100644 --- a/sound/pci/asihpi/hpioctl.c +++ b/sound/pci/asihpi/hpioctl.c @@ -103,6 +103,7 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg) void __user *puhr; union hpi_message_buffer_v1 *hm; union hpi_response_buffer_v1 *hr; + u16 msg_size; u16 res_max_size; u32 uncopied_bytes; int err = 0; @@ -127,22 +128,25 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } /* Now read the message size and data from user space. */ - if (get_user(hm->h.size, (u16 __user *)puhm)) { + if (get_user(msg_size, (u16 __user *)puhm)) { err = -EFAULT; goto out; } - if (hm->h.size > sizeof(*hm)) - hm->h.size = sizeof(*hm); + if (msg_size > sizeof(*hm)) + msg_size = sizeof(*hm); /* printk(KERN_INFO "message size %d\n", hm->h.wSize); */ - uncopied_bytes = copy_from_user(hm, puhm, hm->h.size); + uncopied_bytes = copy_from_user(hm, puhm, msg_size); if (uncopied_bytes) { HPI_DEBUG_LOG(ERROR, "uncopied bytes %d\n", uncopied_bytes); err = -EFAULT; goto out; } + /* Override h.size in case it is changed between two userspace fetches */ + hm->h.size = msg_size; + if (get_user(res_max_size, (u16 __user *)puhr)) { err = -EFAULT; goto out; -- cgit From fbcab13d2e2511a858590846ac2e2d7cbd830591 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:28 -0400 Subject: selftests: silence test output by default Some of the networking tests are very noisy and make it impossible to see if we actually passed the tests as they run. Default to suppressing the output from any tests run in order to make it easier to track what failed. Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 693616651da5..4665463779f5 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -24,7 +24,7 @@ define RUN_TESTS echo "selftests: Warning: file $$BASENAME_TEST is not executable, correct this.";\ echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; \ else \ - cd `dirname $$TEST` > /dev/null; (./$$BASENAME_TEST && echo "ok 1..$$test_num selftests: $$BASENAME_TEST [PASS]") || echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; cd - > /dev/null;\ + cd `dirname $$TEST` > /dev/null; (./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo "ok 1..$$test_num selftests: $$BASENAME_TEST [PASS]") || echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; cd - > /dev/null;\ fi; \ done; endef @@ -55,7 +55,7 @@ endif define EMIT_TESTS @for TEST in $(TEST_GEN_PROGS) $(TEST_PROGS); do \ BASENAME_TEST=`basename $$TEST`; \ - echo "(./$$BASENAME_TEST && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ + echo "(./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ done; endef -- cgit From 422d8dc6fd3afecd66bd6acfcd73a2d53e338ff3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:26 -0400 Subject: selftest: add a reuseaddr test This is to test for a regression introduced by b9470c27607b ("inet: kill smallest_size and smallest_port") which introduced a problem with reuseaddr and bind conflicts. Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/reuseaddr_conflict.c | 114 +++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/reuseaddr_conflict.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 9801253e4802..c612d6e38c62 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -6,3 +6,4 @@ reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa reuseport_dualstack +reuseaddr_conflict diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index de1f5772b878..3df542c84610 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetl TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy +TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict include ../lib.mk diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c new file mode 100644 index 000000000000..7c5b12664b03 --- /dev/null +++ b/tools/testing/selftests/net/reuseaddr_conflict.c @@ -0,0 +1,114 @@ +/* + * Test for the regression introduced by + * + * b9470c27607b ("inet: kill smallest_size and smallest_port") + * + * If we open an ipv4 socket on a port with reuseaddr we shouldn't reset the tb + * when we open the ipv6 conterpart, which is what was happening previously. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 9999 + +int open_port(int ipv6, int any) +{ + int fd = -1; + int reuseaddr = 1; + int v6only = 1; + int addrlen; + int ret = -1; + struct sockaddr *addr; + int family = ipv6 ? AF_INET6 : AF_INET; + + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = htons(PORT), + .sin6_addr = in6addr_any + }; + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = htons(PORT), + .sin_addr.s_addr = any ? htonl(INADDR_ANY) : inet_addr("127.0.0.1"), + }; + + + if (ipv6) { + addr = (struct sockaddr*)&addr6; + addrlen = sizeof(addr6); + } else { + addr = (struct sockaddr*)&addr4; + addrlen = sizeof(addr4); + } + + if ((fd = socket(family, SOCK_STREAM, IPPROTO_TCP)) < 0) { + perror("socket"); + goto out; + } + + if (ipv6 && setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (void*)&v6only, + sizeof(v6only)) < 0) { + perror("setsockopt IPV6_V6ONLY"); + goto out; + } + + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, + sizeof(reuseaddr)) < 0) { + perror("setsockopt SO_REUSEADDR"); + goto out; + } + + if (bind(fd, addr, addrlen) < 0) { + perror("bind"); + goto out; + } + + if (any) + return fd; + + if (listen(fd, 1) < 0) { + perror("listen"); + goto out; + } + return fd; +out: + close(fd); + return ret; +} + +int main(void) +{ + int listenfd; + int fd1, fd2; + + fprintf(stderr, "Opening 127.0.0.1:%d\n", PORT); + listenfd = open_port(0, 0); + if (listenfd < 0) + error(1, errno, "Couldn't open listen socket"); + fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT); + fd1 = open_port(0, 1); + if (fd1 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket"); + fprintf(stderr, "Opening in6addr_any:%d\n", PORT); + fd1 = open_port(1, 1); + if (fd1 < 0) + error(1, errno, "Couldn't open ipv6 reuseport"); + fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT); + fd2 = open_port(0, 1); + if (fd2 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket"); + close(fd1); + fprintf(stderr, "Opening INADDR_ANY:%d after closing ipv6 socket\n", PORT); + fd1 = open_port(0, 1); + if (fd1 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); + fprintf(stderr, "Success"); + return 0; +} -- cgit From e06d79fbc338935ef27befc84f8b8b2e5f878a10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:27 -0400 Subject: selftests: actually run the various net selftests These self tests are just self contained binaries, they are not run by any of the scripts in the directory. This means they need to be marked with TEST_GEN_PROGS to actually be run, not TEST_GEN_FILES. Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/net/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3df542c84610..d86bca991f45 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -5,9 +5,9 @@ CFLAGS += -I../../../../usr/include/ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh TEST_GEN_FILES = socket -TEST_GEN_FILES += psock_fanout psock_tpacket -TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict +TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy +TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa +TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict include ../lib.mk -- cgit From 06e8852ceecccdeff6841a6c5cd78a947a75d5bc Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:19 +0200 Subject: selftests/net: msg_zerocopy enable build with older kernel headers Explicitly define SO_EE_ORIGIN_ZEROCOPY. This makes the test program build with older kernel headers, e.g. from Debian 9. Signed-off-by: Thomas Meyer Signed-off-by: Shuah Khan --- tools/testing/selftests/net/msg_zerocopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 40232af5b023..3ab6ec403905 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -55,7 +55,7 @@ #include #ifndef SO_EE_ORIGIN_ZEROCOPY -#define SO_EE_ORIGIN_ZEROCOPY SO_EE_ORIGIN_UPAGE +#define SO_EE_ORIGIN_ZEROCOPY 5 #endif #ifndef SO_ZEROCOPY -- cgit From 17df6453d4be17910456e99c5a85025aa1b7a246 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:53 +0200 Subject: brcmfmac: add length check in brcmf_cfg80211_escan_handler() Upon handling the firmware notification for scans the length was checked properly and may result in corrupting kernel heap memory due to buffer overruns. This fix addresses CVE-2017-0786. Cc: stable@vger.kernel.org # v4.0.x Cc: Kevin Cernekee Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- .../wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index aaed4ab503ad..26a0de371c26 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -3162,6 +3162,7 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, struct brcmf_cfg80211_info *cfg = ifp->drvr->config; s32 status; struct brcmf_escan_result_le *escan_result_le; + u32 escan_buflen; struct brcmf_bss_info_le *bss_info_le; struct brcmf_bss_info_le *bss = NULL; u32 bi_length; @@ -3181,11 +3182,23 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, if (status == BRCMF_E_STATUS_PARTIAL) { brcmf_dbg(SCAN, "ESCAN Partial result\n"); + if (e->datalen < sizeof(*escan_result_le)) { + brcmf_err("invalid event data length\n"); + goto exit; + } escan_result_le = (struct brcmf_escan_result_le *) data; if (!escan_result_le) { brcmf_err("Invalid escan result (NULL pointer)\n"); goto exit; } + escan_buflen = le32_to_cpu(escan_result_le->buflen); + if (escan_buflen > BRCMF_ESCAN_BUF_SIZE || + escan_buflen > e->datalen || + escan_buflen < sizeof(*escan_result_le)) { + brcmf_err("Invalid escan buffer length: %d\n", + escan_buflen); + goto exit; + } if (le16_to_cpu(escan_result_le->bss_count) != 1) { brcmf_err("Invalid bss_count %d: ignoring\n", escan_result_le->bss_count); @@ -3202,9 +3215,8 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, } bi_length = le32_to_cpu(bss_info_le->length); - if (bi_length != (le32_to_cpu(escan_result_le->buflen) - - WL_ESCAN_RESULTS_FIXED_SIZE)) { - brcmf_err("Invalid bss_info length %d: ignoring\n", + if (bi_length != escan_buflen - WL_ESCAN_RESULTS_FIXED_SIZE) { + brcmf_err("Ignoring invalid bss_info length: %d\n", bi_length); goto exit; } -- cgit From 35f62727df0ed8e5e4857e162d94fd46d861f1cf Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:54 +0200 Subject: brcmfmac: setup passive scan if requested by user-space The driver was not properly configuring firmware with regard to the type of scan. It always performed an active scan even when user-space was requesting for passive scan, ie. the scan request was done without any SSIDs specified. Cc: stable@vger.kernel.org # v4.0.x Reported-by: Huang, Jiangyang Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- .../wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 19 ++++--------------- .../wireless/broadcom/brcm80211/brcmfmac/fwil_types.h | 5 +++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 26a0de371c26..4157c90ad973 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -980,7 +980,7 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, eth_broadcast_addr(params_le->bssid); params_le->bss_type = DOT11_BSSTYPE_ANY; - params_le->scan_type = 0; + params_le->scan_type = BRCMF_SCANTYPE_ACTIVE; params_le->channel_num = 0; params_le->nprobes = cpu_to_le32(-1); params_le->active_time = cpu_to_le32(-1); @@ -988,12 +988,9 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, params_le->home_time = cpu_to_le32(-1); memset(¶ms_le->ssid_le, 0, sizeof(params_le->ssid_le)); - /* if request is null exit so it will be all channel broadcast scan */ - if (!request) - return; - n_ssids = request->n_ssids; n_channels = request->n_channels; + /* Copy channel array if applicable */ brcmf_dbg(SCAN, "### List of channelspecs to scan ### %d\n", n_channels); @@ -1030,16 +1027,8 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, ptr += sizeof(ssid_le); } } else { - brcmf_dbg(SCAN, "Broadcast scan %p\n", request->ssids); - if ((request->ssids) && request->ssids->ssid_len) { - brcmf_dbg(SCAN, "SSID %s len=%d\n", - params_le->ssid_le.SSID, - request->ssids->ssid_len); - params_le->ssid_le.SSID_len = - cpu_to_le32(request->ssids->ssid_len); - memcpy(¶ms_le->ssid_le.SSID, request->ssids->ssid, - request->ssids->ssid_len); - } + brcmf_dbg(SCAN, "Performing passive scan\n"); + params_le->scan_type = BRCMF_SCANTYPE_PASSIVE; } /* Adding mask to channel numbers */ params_le->channel_num = diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h index 8391989b1882..e0d22fedb2b4 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h @@ -45,6 +45,11 @@ #define BRCMF_SCAN_PARAMS_COUNT_MASK 0x0000ffff #define BRCMF_SCAN_PARAMS_NSSID_SHIFT 16 +/* scan type definitions */ +#define BRCMF_SCANTYPE_DEFAULT 0xFF +#define BRCMF_SCANTYPE_ACTIVE 0 +#define BRCMF_SCANTYPE_PASSIVE 1 + #define BRCMF_WSEC_MAX_PSK_LEN 32 #define BRCMF_WSEC_PASSPHRASE BIT(0) -- cgit From ce21574ad1922b403198ee664c4dff276f514f1d Mon Sep 17 00:00:00 2001 From: Sekhar Nori Date: Tue, 29 Aug 2017 13:52:51 +0530 Subject: ARM: dts: da850-evm: add serial and ethernet aliases Add aliases for serial and ethernet nodes. Serial aliases help keep order of tty nodes fixed and ethernet alias is used by bootloader to setup mac address correctly. Reported-by: Adam Ford Acked-by: Tony Lindgren Fixes: dd7deaf218bf ("ARM: davinci: da850: add DT node for ethernet") Signed-off-by: Sekhar Nori --- arch/arm/boot/dts/da850-evm.dts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/boot/dts/da850-evm.dts b/arch/arm/boot/dts/da850-evm.dts index 67e72bc72e80..c75507922f7d 100644 --- a/arch/arm/boot/dts/da850-evm.dts +++ b/arch/arm/boot/dts/da850-evm.dts @@ -15,6 +15,13 @@ compatible = "ti,da850-evm", "ti,da850"; model = "DA850/AM1808/OMAP-L138 EVM"; + aliases { + serial0 = &serial0; + serial1 = &serial1; + serial2 = &serial2; + ethernet0 = ð0; + }; + soc@1c00000 { pmx_core: pinmux@14120 { status = "okay"; -- cgit From 5a5d718f952b55e7fa96bfaf6f31f2c08babd77b Mon Sep 17 00:00:00 2001 From: Sriram Periyasamy Date: Tue, 19 Sep 2017 17:25:05 -0500 Subject: ALSA: hda - program ICT bits to support HBR audio On recent Intel platforms (Haswell, Broadwell, Skylake, ApolloLake, KabyLake, ...), the IEC Coding Type (ICT) bitfield in the Digital Converter Control #3 needs to be set explicitly for HDMI/DisplayPort High Bit Rate (HBR) audio playback to work. This was not required in earlier platforms when HBR was first introduced. The ICT bits are defined in Section 7.3.3.9 of the HDaudio 1.0a specification. Since the ICT bitfield was not specified for HDAudio 1.0 devices (before 2009), we only program it on machines more recent than Haswell. We tested that this fix is not needed on Baytrail-I (MinnowBoard Turbot) and believe by extension it also does not apply to Braswell. [ Moved AC_VERB_SET_DIGI_CONVERT_3 definition to the right place by tiwai ] Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98797 Signed-off-by: Sriram Periyasamy Signed-off-by: Pierre-Louis Bossart Signed-off-by: Subhransu S. Prusty Acked-by: Vinod Koul Signed-off-by: Takashi Iwai --- include/sound/hda_verbs.h | 1 + sound/pci/hda/patch_hdmi.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/sound/hda_verbs.h b/include/sound/hda_verbs.h index d0509db6d0ec..f89cd5ee1c7a 100644 --- a/include/sound/hda_verbs.h +++ b/include/sound/hda_verbs.h @@ -95,6 +95,7 @@ enum { #define AC_VERB_SET_EAPD_BTLENABLE 0x70c #define AC_VERB_SET_DIGI_CONVERT_1 0x70d #define AC_VERB_SET_DIGI_CONVERT_2 0x70e +#define AC_VERB_SET_DIGI_CONVERT_3 0x73e #define AC_VERB_SET_VOLUME_KNOB_CONTROL 0x70f #define AC_VERB_SET_GPIO_DATA 0x715 #define AC_VERB_SET_GPIO_MASK 0x716 diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 2b64fabd5faa..c19c81d230bd 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -906,6 +906,7 @@ static int hdmi_setup_stream(struct hda_codec *codec, hda_nid_t cvt_nid, hda_nid_t pin_nid, u32 stream_tag, int format) { struct hdmi_spec *spec = codec->spec; + unsigned int param; int err; err = spec->ops.pin_hbr_setup(codec, pin_nid, is_hbr_format(format)); @@ -915,6 +916,26 @@ static int hdmi_setup_stream(struct hda_codec *codec, hda_nid_t cvt_nid, return err; } + if (is_haswell_plus(codec)) { + + /* + * on recent platforms IEC Coding Type is required for HBR + * support, read current Digital Converter settings and set + * ICT bitfield if needed. + */ + param = snd_hda_codec_read(codec, cvt_nid, 0, + AC_VERB_GET_DIGI_CONVERT_1, 0); + + param = (param >> 16) & ~(AC_DIG3_ICT); + + /* on recent platforms ICT mode is required for HBR support */ + if (is_hbr_format(format)) + param |= 0x1; + + snd_hda_codec_write(codec, cvt_nid, 0, + AC_VERB_SET_DIGI_CONVERT_3, param); + } + snd_hda_codec_setup_stream(codec, cvt_nid, stream_tag, 0, format); return 0; } -- cgit From 97e133d54c1ca8948b191e5721a145a76c4db33d Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 19 Sep 2017 11:46:16 +0300 Subject: usb: gadget: core: fix ->udc_set_speed() logic Consider the following case: udc controller supports SuperSpeed. If we first load a HighSpeed gadget followed by a SuperSpeed gadget, the SuperSpeed gadget will be limited to HighSpeed as UDC core driver doesn't call ->udc_set_speed() in the second case. Call ->udc_set_speed() unconditionally to fix this issue. This will also fix the case for dwc3 controller driver when SuperSpeed gadget is loaded first and works in HighSpeed only as udc_set_speed() was never being called. Fixes: 6099eca796ae ("usb: gadget: core: introduce ->udc_set_speed() method") Cc: [v4.13+] Signed-off-by: Roger Quadros Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 75c51ca4ee0f..d41d07aae0ce 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1320,8 +1320,7 @@ static int udc_bind_to_driver(struct usb_udc *udc, struct usb_gadget_driver *dri udc->dev.driver = &driver->driver; udc->gadget->dev.driver = &driver->driver; - if (driver->max_speed < udc->gadget->max_speed) - usb_gadget_udc_set_speed(udc, driver->max_speed); + usb_gadget_udc_set_speed(udc, driver->max_speed); ret = driver->bind(udc->gadget, driver); if (ret) -- cgit From 9ada8c582088d32bd5c071c17213bc6edf37443a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 13 Sep 2017 15:31:33 +0900 Subject: usb: gadget: function: printer: avoid spinlock recursion If usb_gadget_giveback_request() is called in usb_ep_queue(), this printer_write() is possible to cause spinlock recursion. So, this patch adds spin_unlock() before calls usb_ep_queue() to avoid it. Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_printer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 8df244fc9d80..ea0da35a44e2 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -555,6 +555,7 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) size_t size; /* Amount of data in a TX request. */ size_t bytes_copied = 0; struct usb_request *req; + int value; DBG(dev, "printer_write trying to send %d bytes\n", (int)len); @@ -634,7 +635,11 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) return -EAGAIN; } - if (usb_ep_queue(dev->in_ep, req, GFP_ATOMIC)) { + /* here, we unlock, and only unlock, to avoid deadlock. */ + spin_unlock(&dev->lock); + value = usb_ep_queue(dev->in_ep, req, GFP_ATOMIC); + spin_lock(&dev->lock); + if (value) { list_add(&req->list, &dev->tx_reqs); spin_unlock_irqrestore(&dev->lock, flags); mutex_unlock(&dev->lock_printer_io); -- cgit From 641663a19f6002ecc5c5af517a5184d4164cc749 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Sep 2017 09:24:15 -0700 Subject: usb: gadget: udc: fix snps_udc_plat.c build errors Fix build errors that happen when CONFIG_EXTCON=m and CONFIG_USB_SNP_UDC_PLAT=y by preventing that combination in Kconfig. CONFIG_EXTCON can still be disabled or enabled for this driver since has stubs for the disabled case, but if CONFIG_EXTCON=m, USB_SNP_UDC_PLAT is restricted to m or n (cannot be builtin). drivers/built-in.o: In function `udc_plat_remove': snps_udc_plat.c:(.text+0x2c4060): undefined reference to `extcon_unregister_notifier' drivers/built-in.o: In function `udc_plat_probe': snps_udc_plat.c:(.text+0x2c438c): undefined reference to `extcon_get_edev_by_phandle' snps_udc_plat.c:(.text+0x2c43f2): undefined reference to `extcon_register_notifier' snps_udc_plat.c:(.text+0x2c4416): undefined reference to `extcon_get_state' snps_udc_plat.c:(.text+0x2c44f7): undefined reference to `extcon_unregister_notifier' Reported-by: kbuild test robot Signed-off-by: Randy Dunlap Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig index 7cd5c969fcbe..1e9567091d86 100644 --- a/drivers/usb/gadget/udc/Kconfig +++ b/drivers/usb/gadget/udc/Kconfig @@ -273,6 +273,7 @@ config USB_SNP_CORE config USB_SNP_UDC_PLAT tristate "Synopsys USB 2.0 Device controller" depends on USB_GADGET && OF && HAS_DMA + depends on EXTCON || EXTCON=n select USB_GADGET_DUALSPEED select USB_SNP_CORE default ARCH_BCM_IPROC -- cgit From 7661ca09b2ff98f48693f431bb01fed62830e433 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Sep 2017 16:14:31 +0200 Subject: usb: gadget: dummy: fix nonsensical comparisons gcc-8 points out two comparisons that are clearly bogus and almost certainly not what the author intended to write: drivers/usb/gadget/udc/dummy_hcd.c: In function 'set_link_state_by_speed': drivers/usb/gadget/udc/dummy_hcd.c:379:31: error: bitwise comparison always evaluates to false [-Werror=tautological-compare] USB_PORT_STAT_ENABLE) == 1 && ^~ drivers/usb/gadget/udc/dummy_hcd.c:381:25: error: bitwise comparison always evaluates to false [-Werror=tautological-compare] USB_SS_PORT_LS_U0) == 1 && ^~ I looked at the code for a bit and came up with a change that makes it look like what the author probably meant here. This makes it look reasonable to me and to gcc, shutting up the warning. It does of course change behavior as the two conditions are actually evaluated rather than being hardcoded to false, and I have made no attempt at verifying that the changed logic makes sense in the context of a USB HCD, so that part needs to be reviewed carefully. Fixes: 1cd8fd2887e1 ("usb: gadget: dummy_hcd: add SuperSpeed support") Cc: Tatyana Brokhman Cc: Felipe Balbi Acked-by: Alan Stern Signed-off-by: Arnd Bergmann Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index a030d7923d7d..b1e21b3be6e1 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -375,11 +375,10 @@ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); - if ((dum_hcd->port_status & - USB_PORT_STAT_ENABLE) == 1 && - (dum_hcd->port_status & - USB_SS_PORT_LS_U0) == 1 && - dum_hcd->rh_state != DUMMY_RH_SUSPENDED) + if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) && + (dum_hcd->port_status & + USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0 && + dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } else { -- cgit From 20da2ec06bfad2d4dfd40d77d3831f5e56365d20 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Mon, 18 Sep 2017 15:29:49 +0300 Subject: qtnfmac: lock access to h/w in tx path Fix tx path regression. Lock should be held when queuing packets to h/w fifos in order to properly handle configurations with multiple enabled interfaces. Signed-off-by: Sergey Matyukevich Signed-off-by: Kalle Valo --- drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c | 9 ++++++++- drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c index 502e72b7cdcc..69131965a298 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c +++ b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c @@ -661,14 +661,18 @@ static int qtnf_pcie_data_tx(struct qtnf_bus *bus, struct sk_buff *skb) struct qtnf_pcie_bus_priv *priv = (void *)get_bus_priv(bus); dma_addr_t txbd_paddr, skb_paddr; struct qtnf_tx_bd *txbd; + unsigned long flags; int len, i; u32 info; int ret = 0; + spin_lock_irqsave(&priv->tx0_lock, flags); + if (!qtnf_tx_queue_ready(priv)) { if (skb->dev) netif_stop_queue(skb->dev); + spin_unlock_irqrestore(&priv->tx0_lock, flags); return NETDEV_TX_BUSY; } @@ -717,8 +721,10 @@ tx_done: dev_kfree_skb_any(skb); } - qtnf_pcie_data_tx_reclaim(priv); priv->tx_done_count++; + spin_unlock_irqrestore(&priv->tx0_lock, flags); + + qtnf_pcie_data_tx_reclaim(priv); return NETDEV_TX_OK; } @@ -1247,6 +1253,7 @@ static int qtnf_pcie_probe(struct pci_dev *pdev, const struct pci_device_id *id) strcpy(bus->fwname, QTN_PCI_PEARL_FW_NAME); init_completion(&bus->request_firmware_complete); mutex_init(&bus->bus_lock); + spin_lock_init(&pcie_priv->tx0_lock); spin_lock_init(&pcie_priv->irq_lock); spin_lock_init(&pcie_priv->tx_reclaim_lock); diff --git a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h index e76a23716ee0..86ac1ccedb52 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h +++ b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h @@ -34,6 +34,8 @@ struct qtnf_pcie_bus_priv { /* lock for tx reclaim operations */ spinlock_t tx_reclaim_lock; + /* lock for tx0 operations */ + spinlock_t tx0_lock; u8 msi_enabled; int mps; -- cgit From a715b3a0efe76d36c3ef96a93894a13db9d3a72f Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Mon, 18 Sep 2017 15:29:50 +0300 Subject: qtnfmac: cancel scans on wireless interface changes Cancel active scans and deactivate firmware scan watchdog timer when wireless interface configuration is changed. The usecases include wireless interface mode change, interface down, AP stop, virtual interface removal. Signed-off-by: Sergey Matyukevich Signed-off-by: Kalle Valo --- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 9 ++++++--- drivers/net/wireless/quantenna/qtnfmac/cfg80211.h | 3 +++ drivers/net/wireless/quantenna/qtnfmac/event.c | 2 -- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 856fa6e8327e..a450bc6bc774 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -115,6 +115,8 @@ int qtnf_del_virtual_intf(struct wiphy *wiphy, struct wireless_dev *wdev) vif = qtnf_netdev_get_priv(wdev->netdev); + qtnf_scan_done(vif->mac, true); + if (qtnf_cmd_send_del_intf(vif)) pr_err("VIF%u.%u: failed to delete VIF\n", vif->mac->macid, vif->vifid); @@ -335,6 +337,8 @@ static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev) struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; + qtnf_scan_done(vif->mac, true); + ret = qtnf_cmd_send_stop_ap(vif); if (ret) { pr_err("VIF%u.%u: failed to stop AP operation in FW\n", @@ -570,8 +574,6 @@ qtnf_del_station(struct wiphy *wiphy, struct net_device *dev, !qtnf_sta_list_lookup(&vif->sta_list, params->mac)) return 0; - qtnf_scan_done(vif->mac, true); - ret = qtnf_cmd_send_del_sta(vif, params); if (ret) pr_err("VIF%u.%u: failed to delete STA %pM\n", @@ -1134,8 +1136,9 @@ void qtnf_virtual_intf_cleanup(struct net_device *ndev) } vif->sta_state = QTNF_STA_DISCONNECTED; - qtnf_scan_done(mac, true); } + + qtnf_scan_done(mac, true); } void qtnf_cfg80211_vif_reset(struct qtnf_vif *vif) diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h index 6a4af52522b8..66db26613b1f 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h @@ -34,6 +34,9 @@ static inline void qtnf_scan_done(struct qtnf_wmac *mac, bool aborted) .aborted = aborted, }; + if (timer_pending(&mac->scan_timeout)) + del_timer_sync(&mac->scan_timeout); + mutex_lock(&mac->mac_lock); if (mac->scan_req) { diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c index 0fc2814eafad..43d2e7fd6e02 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/event.c +++ b/drivers/net/wireless/quantenna/qtnfmac/event.c @@ -345,8 +345,6 @@ qtnf_event_handle_scan_complete(struct qtnf_wmac *mac, return -EINVAL; } - if (timer_pending(&mac->scan_timeout)) - del_timer_sync(&mac->scan_timeout); qtnf_scan_done(mac, le32_to_cpu(status->flags) & QLINK_SCAN_ABORTED); return 0; -- cgit From 0dcd020b7abbff238f188d4db8f02389dc849553 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 20 Sep 2017 09:21:40 +0800 Subject: ALSA: usb-audio: Add sample rate quirk for Plantronics C310/C520-M Like other Plantronics devices, C310 and C520-M do not support sample rate reading. Add them to the sample rate quirk accordingly. BugLink: https://bugs.launchpad.net/bugs/1708499 BugLink: https://bugs.launchpad.net/bugs/1709282 Signed-off-by: Kai-Heng Feng Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 913552078285..b8cb57aeec77 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1137,6 +1137,8 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x047F, 0x02F7): /* Plantronics BT-600 */ case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */ case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ + case USB_ID(0x047F, 0xC022): /* Plantronics C310 */ + case USB_ID(0x047F, 0xC036): /* Plantronics C520-M */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */ -- cgit From b22666febf6fc67776d49782057fe4dd06f41552 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 20 Sep 2017 18:10:17 -0400 Subject: drm/amdkfd: Fix incorrect destroy_mqd parameter When uninitializing a kernel queue. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 0649dd43e780..a4ca1133482c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -184,7 +184,7 @@ static void uninitialize(struct kernel_queue *kq) if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) kq->mqd->destroy_mqd(kq->mqd, kq->queue->mqd, - false, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, kq->queue->queue); -- cgit From cb1d9967461cdf3b6aac6317c8d954a14f842571 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 20 Sep 2017 18:10:21 -0400 Subject: drm/amdkfd: Fix kernel-queue wrapping bugs Avoid intermediate negative numbers when doing calculations with a mix of signed and unsigned variables where implicit conversions can lead to unexpected results. When kernel queue buffer wraps around to 0, we need to check that rptr won't be overwritten by the new packet. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index a4ca1133482c..ed71ad40e8f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq, uint32_t wptr, rptr; unsigned int *queue_address; + /* When rptr == wptr, the buffer is empty. + * When rptr == wptr + 1, the buffer is full. + * It is always rptr that advances to the position of wptr, rather than + * the opposite. So we can only use up to queue_size_dwords - 1 dwords. + */ rptr = *kq->rptr_kernel; wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq, pr_debug("wptr: %d\n", wptr); pr_debug("queue_address 0x%p\n", queue_address); - available_size = (rptr - 1 - wptr + queue_size_dwords) % + available_size = (rptr + queue_size_dwords - 1 - wptr) % queue_size_dwords; - if (packet_size_in_dwords >= queue_size_dwords || - packet_size_in_dwords >= available_size) { + if (packet_size_in_dwords > available_size) { /* * make sure calling functions know * acquire_packet_buffer() failed @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq, } if (wptr + packet_size_in_dwords >= queue_size_dwords) { + /* make sure after rolling back to position 0, there is + * still enough space. + */ + if (packet_size_in_dwords >= rptr) { + *buffer_ptr = NULL; + return -ENOMEM; + } + /* fill nops, roll back and start at position 0 */ while (wptr > 0) { queue_address[wptr] = kq->nop_packet; wptr = (wptr + 1) % queue_size_dwords; -- cgit From c986169fdec992c464c84d0a3abdbfc846ed3df9 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 20 Sep 2017 18:10:22 -0400 Subject: drm/amdkfd: Print event limit messages only once per process To avoid spamming the log. Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 ++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 5979158c3f7b..944abfad39c1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -292,7 +292,10 @@ static int create_signal_event(struct file *devkfd, struct kfd_event *ev) { if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { - pr_warn("Signal event wasn't created because limit was reached\n"); + if (!p->signal_event_limit_reached) { + pr_warn("Signal event wasn't created because limit was reached\n"); + p->signal_event_limit_reached = true; + } return -ENOMEM; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index b397ec726400..b87e96cee5fa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -521,6 +521,7 @@ struct kfd_process { struct list_head signal_event_pages; u32 next_nonsignal_event_id; size_t signal_event_count; + bool signal_event_limit_reached; }; /** -- cgit From 544e3bf4f0e8278400f19ca7918a3cdf2548b4eb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:22:08 +0200 Subject: reset: Restrict RESET_HSDK to ARC_SOC_HSDK or COMPILE_TEST The HSDK reset driver is only useful when building for an ARC HSDK platform. While at it, drop the "default n", as that is the default. Fixes: e0be864f14240cb1 ("ARC: reset: introduce HSDKv1 reset driver") Signed-off-by: Geert Uytterhoeven [p.zabel@pengutronix.de: rebased, renamed RESET_HSDK_V1 to RESET_HSDK] Signed-off-by: Philipp Zabel --- drivers/reset/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index a7c7d5a8c089..e2baecbb9dd3 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -37,7 +37,7 @@ config RESET_BERLIN config RESET_HSDK bool "Synopsys HSDK Reset Driver" depends on HAS_IOMEM - default n + depends on ARC_SOC_HSDK || COMPILE_TEST help This enables the reset controller driver for HSDK board. -- cgit From 2bc84526d174a2a89c76438f049fc03ac259a159 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 6 Sep 2017 16:44:35 -0600 Subject: Makefile: kselftest and kselftest-clean fail for make O=dir case kselftest and kselftest-clean targets fail when object directory is specified to relocate objects. Fix it so it can find the source tree to build from. make O=/tmp/kselftest_top kselftest make[1]: Entering directory '/tmp/kselftest_top' make[2]: Entering directory '/tmp/kselftest_top' make[2]: *** tools/testing/selftests: No such file or directory. Stop. make[2]: Leaving directory '/tmp/kselftest_top' ./linux-kselftest/Makefile:1185: recipe for target 'kselftest' failed make[1]: *** [kselftest] Error 2 make[1]: Leaving directory '/tmp/kselftest_top' Makefile:145: recipe for target 'sub-make' failed make: *** [sub-make] Error 2 Signed-off-by: Shuah Khan Acked-by: Masahiro Yamada --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 64cbc66cebca..9a70578922d3 100644 --- a/Makefile +++ b/Makefile @@ -1172,11 +1172,11 @@ headers_check: headers_install PHONY += kselftest kselftest: - $(Q)$(MAKE) -C tools/testing/selftests run_tests + $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests run_tests PHONY += kselftest-clean kselftest-clean: - $(Q)$(MAKE) -C tools/testing/selftests clean + $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests clean PHONY += kselftest-merge kselftest-merge: -- cgit From 8050ef2b83a18f628f9501af958fbff39443d58d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 6 Sep 2017 18:36:22 -0600 Subject: selftests: lib.mk: kselftest and kselftest-clean fail for make O=dir case kselftest and kselftest-clean targets fail when object directory is specified to relocate objects. Main Makefile make O= path clears the built-in defines LINK.c, COMPILE.S, LINK.S, and RM that are used in lib.mk to build and clean targets. Define them. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 4665463779f5..266d3ed4bb41 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -7,6 +7,7 @@ OUTPUT := $(shell pwd) endif TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) +TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) @@ -62,6 +63,11 @@ endef emit_tests: $(EMIT_TESTS) +# define if isn't already. It is undefined in make O= case. +ifeq ($(RM),) +RM := rm -f +endif + define CLEAN $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) endef @@ -69,6 +75,15 @@ endef clean: $(CLEAN) +# When make O= with kselftest target from main level +# the following aren't defined. +# +ifneq ($(KBUILD_SRC),) +LINK.c = $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) +COMPILE.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c +LINK.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) +endif + $(OUTPUT)/%:%.c $(LINK.c) $^ $(LDLIBS) -o $@ -- cgit From 52fd1d082398b928a86d4fdf33c9f3abe1bf7914 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Sep 2017 19:57:43 -0600 Subject: selftests: Makefile: clear LDFLAGS for make O=dir use-case kselftest target fails when object directory is specified to relocate objects. Inherited "LDFLAGS = -m" fails the test builds. Clear it. Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 26ce4f7168be..f4368db011ea 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -52,6 +52,10 @@ override LDFLAGS = override MAKEFLAGS = endif +ifneq ($(KBUILD_SRC),) +override LDFLAGS = +endif + BUILD := $(O) ifndef BUILD BUILD := $(KBUILD_OUTPUT) -- cgit From e0a5696a23290c31c5ac2c76f0e7fe50a12c1fc6 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Sep 2017 20:04:26 -0600 Subject: selftests: lib.mk: fix test executable status check to use full path Fix test executable status check to use full path for make O=dir case,m when tests are relocated to user specified object directory. Without the full path, this check fails to find the file and fails the test. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 266d3ed4bb41..fd1cbbbca8d7 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -21,7 +21,7 @@ define RUN_TESTS test_num=`echo $$test_num+1 | bc`; \ echo "selftests: $$BASENAME_TEST"; \ echo "========================================"; \ - if [ ! -x $$BASENAME_TEST ]; then \ + if [ ! -x $$TEST ]; then \ echo "selftests: Warning: file $$BASENAME_TEST is not executable, correct this.";\ echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; \ else \ -- cgit From e2fb65594cae9a016fab4639a5d22a914f1e16c8 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 8 Sep 2017 16:05:50 -0600 Subject: selftests: watchdog: fix to use TEST_GEN_PROGS and remove clean TEST_PROGS should be used for test scripts that don't ned to be built. Use TEST_GEN_PROGS instead which is intended for test executables. Remove clean target and let the common clean take care of cleaning. Signed-off-by: Shuah Khan --- tools/testing/selftests/watchdog/Makefile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/testing/selftests/watchdog/Makefile b/tools/testing/selftests/watchdog/Makefile index f863c664e3d1..ee068511fd0b 100644 --- a/tools/testing/selftests/watchdog/Makefile +++ b/tools/testing/selftests/watchdog/Makefile @@ -1,8 +1,3 @@ -TEST_PROGS := watchdog-test - -all: $(TEST_PROGS) +TEST_GEN_PROGS := watchdog-test include ../lib.mk - -clean: - rm -fr $(TEST_PROGS) -- cgit From be16a244c199983656e58ddb10d80c67197e502f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 13:06:33 -0600 Subject: selftests: lib.mk: add TEST_CUSTOM_PROGS to allow custom test run/install Some tests such as sync can't use generic build rules in lib.mk and require custom rules. Currently there is no provision to allow custom builds and test such as sync use TEST_PROGS which is reserved for test shell scripts. Add TEST_CUSTOM_PROGS variable to lib.mk to run and install custom tests built by individual test make files. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index fd1cbbbca8d7..b4699c71aee4 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -6,6 +6,12 @@ ifeq (0,$(MAKELEVEL)) OUTPUT := $(shell pwd) endif +# The following are built by lib.mk common compile rules. +# TEST_CUSTOM_PROGS should be used by tests that require +# custom build rule and prevent common build rule use. +# TEST_PROGS are for test shell scripts. +# TEST_CUSTOM_PROGS and TEST_PROGS will be run by common run_tests +# and install targets. Common clean doesn't touch them. TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) @@ -31,7 +37,7 @@ define RUN_TESTS endef run_tests: all - $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_PROGS)) + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) define INSTALL_RULE @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ @@ -39,10 +45,10 @@ define INSTALL_RULE echo "rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/"; \ rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/; \ fi - @if [ "X$(TEST_GEN_PROGS)$(TEST_GEN_PROGS_EXTENDED)$(TEST_GEN_FILES)" != "X" ]; then \ + @if [ "X$(TEST_GEN_PROGS)$(TEST_CUSTOM_PROGS)$(TEST_GEN_PROGS_EXTENDED)$(TEST_GEN_FILES)" != "X" ]; then \ mkdir -p ${INSTALL_PATH}; \ - echo "rsync -a $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/"; \ - rsync -a $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/; \ + echo "rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/"; \ + rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/; \ fi endef @@ -54,7 +60,7 @@ else endif define EMIT_TESTS - @for TEST in $(TEST_GEN_PROGS) $(TEST_PROGS); do \ + @for TEST in $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS); do \ BASENAME_TEST=`basename $$TEST`; \ echo "(./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ done; -- cgit From 38f7251852a0cd47f34af4a6f84df0fadafa8ac7 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 13:46:44 -0600 Subject: selftests: sync: use TEST_CUSTOM_PROGS instead of TEST_PROGS lib.mk var TEST_CUSTOM_PROGS is for tests that need custom build rules. TEST_PROGS is used for test shell scripts. Fix it to use TEST_CUSTOM_PROGS. lib.mk will run and install them. Signed-off-by: Shuah Khan --- tools/testing/selftests/sync/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile index 4981c6b6d050..43db80b71e80 100644 --- a/tools/testing/selftests/sync/Makefile +++ b/tools/testing/selftests/sync/Makefile @@ -2,9 +2,11 @@ CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra CFLAGS += -I../../../../usr/include/ LDFLAGS += -pthread -TEST_PROGS = sync_test +# lib.mk TEST_CUSTOM_PROGS var is for custome tests that need special +# build rules. lib.mk will run and install them. +TEST_CUSTOM_PROGS = sync_test -all: $(TEST_PROGS) +all: $(TEST_CUSTOM_PROGS) include ../lib.mk -- cgit From b2fc6ade9fe2118591e7f51e21b51f65e36f138f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 14:14:41 -0600 Subject: selftests: sync: kselftest and kselftest-clean fail for make O=dir case sync test fails to build when object directory is specified to relocate object files. Fix it to specify the correct path. Fix clean target to remove objects. Also include simplified logic to use TEST_CUSTOM_PROGS in build and clean targets instead of hard-coding the test name each time. Signed-off-by: Shuah Khan --- tools/testing/selftests/sync/Makefile | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile index 43db80b71e80..8e04d0afcbd7 100644 --- a/tools/testing/selftests/sync/Makefile +++ b/tools/testing/selftests/sync/Makefile @@ -2,14 +2,16 @@ CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra CFLAGS += -I../../../../usr/include/ LDFLAGS += -pthread -# lib.mk TEST_CUSTOM_PROGS var is for custome tests that need special +.PHONY: all clean + +include ../lib.mk + +# lib.mk TEST_CUSTOM_PROGS var is for custom tests that need special # build rules. lib.mk will run and install them. -TEST_CUSTOM_PROGS = sync_test +TEST_CUSTOM_PROGS := $(OUTPUT)/sync_test all: $(TEST_CUSTOM_PROGS) -include ../lib.mk - OBJS = sync_test.o sync.o TESTS += sync_alloc.o @@ -20,6 +22,16 @@ TESTS += sync_stress_parallelism.o TESTS += sync_stress_consumer.o TESTS += sync_stress_merge.o -sync_test: $(OBJS) $(TESTS) +OBJS := $(patsubst %,$(OUTPUT)/%,$(OBJS)) +TESTS := $(patsubst %,$(OUTPUT)/%,$(TESTS)) + +$(TEST_CUSTOM_PROGS): $(TESTS) $(OBJS) + $(CC) -o $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS) $(CFLAGS) $(LDFLAGS) + +$(OBJS): $(OUTPUT)/%.o: %.c + $(CC) -c $^ -o $@ + +$(TESTS): $(OUTPUT)/%.o: %.c + $(CC) -c $^ -o $@ -EXTRA_CLEAN := sync_test $(OBJS) $(TESTS) +EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS) -- cgit From 1a940687e424080a6ed285f2de8b2f0d018edf3e Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 19:03:11 -0600 Subject: selftests: lib.mk: copy test scripts and test files for make O=dir run For make O=dir run_tests to work, test scripts, test files, and other dependencies need to be copied over to the object directory. Running tests from the object directory is necessary to avoid making the source tree dirty. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index b4699c71aee4..f65886af7c0c 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -37,7 +37,18 @@ define RUN_TESTS endef run_tests: all +ifneq ($(KBUILD_SRC),) + @if [ "X$(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)" != "X" ]; then + @rsync -aq $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT) + fi + @if [ "X$(TEST_PROGS)" != "X" ]; then + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(OUTPUT)/$(TEST_PROGS)) + else + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS)) + fi +else $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) +endif define INSTALL_RULE @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ -- cgit From 6af1de2e4ec49635905aaed31d073a0d92c8d3bf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Sep 2017 14:58:53 +0200 Subject: ath10k: mark PM functions as __maybe_unused When CONFIG_PM_SLEEP is disabled, we get a compile-time warning: drivers/net/wireless/ath/ath10k/pci.c:3417:12: error: 'ath10k_pci_pm_resume' defined but not used [-Werror=unused-function] static int ath10k_pci_pm_resume(struct device *dev) ^~~~~~~~~~~~~~~~~~~~ drivers/net/wireless/ath/ath10k/pci.c:3401:12: error: 'ath10k_pci_pm_suspend' defined but not used [-Werror=unused-function] static int ath10k_pci_pm_suspend(struct device *dev) Rather than fixing the #ifdef, this just marks both functions as __maybe_unused, which is a more robust way to do this. Fixes: 32faa3f0ee50 ("ath10k: add the PCI PM core suspend/resume ops") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/pci.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c index bc1633945a56..195dafb98131 100644 --- a/drivers/net/wireless/ath/ath10k/pci.c +++ b/drivers/net/wireless/ath/ath10k/pci.c @@ -3396,9 +3396,7 @@ static void ath10k_pci_remove(struct pci_dev *pdev) MODULE_DEVICE_TABLE(pci, ath10k_pci_id_table); -#ifdef CONFIG_PM - -static int ath10k_pci_pm_suspend(struct device *dev) +static __maybe_unused int ath10k_pci_pm_suspend(struct device *dev) { struct ath10k *ar = dev_get_drvdata(dev); int ret; @@ -3414,7 +3412,7 @@ static int ath10k_pci_pm_suspend(struct device *dev) return ret; } -static int ath10k_pci_pm_resume(struct device *dev) +static __maybe_unused int ath10k_pci_pm_resume(struct device *dev) { struct ath10k *ar = dev_get_drvdata(dev); int ret; @@ -3433,7 +3431,6 @@ static int ath10k_pci_pm_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(ath10k_pci_pm_ops, ath10k_pci_pm_suspend, ath10k_pci_pm_resume); -#endif static struct pci_driver ath10k_pci_driver = { .name = "ath10k_pci", -- cgit From 2e1c42391ff2556387b3cb6308b24f6f65619feb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 21 Sep 2017 16:58:48 +0200 Subject: USB: core: harden cdc_parse_cdc_header Andrey Konovalov reported a possible out-of-bounds problem for the cdc_parse_cdc_header function. He writes: It looks like cdc_parse_cdc_header() doesn't validate buflen before accessing buffer[1], buffer[2] and so on. The only check present is while (buflen > 0). So fix this issue up by properly validating the buffer length matches what the descriptor says it is. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 4c38ea41ae96..371a07d874a3 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -2069,6 +2069,10 @@ int cdc_parse_cdc_header(struct usb_cdc_parsed_header *hdr, elength = 1; goto next_desc; } + if ((buflen < elength) || (elength < 3)) { + dev_err(&intf->dev, "invalid descriptor buffer length\n"); + break; + } if (buffer[1] != USB_DT_CS_INTERFACE) { dev_err(&intf->dev, "skipping garbage\n"); goto next_desc; -- cgit From 51a9a8284e43642fc3e85810fd54f4c245d23a14 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 10:03:12 +0100 Subject: x86/xen: clean up clang build warning In the case where sizeof(maddr) != sizeof(long) p is initialized and never read and clang throws a warning on this. Move declaration of p to clean up the clang build warning: warning: Value stored to 'p' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/hypercall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9606688caa4b..e089c1675a7c 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,13 +552,13 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { - u32 *p = (u32 *) &desc; - mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; mcl->args[1] = *(unsigned long *)&desc; } else { + u32 *p = (u32 *)&desc; + mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; mcl->args[2] = *p++; -- cgit From 4ba72fc080ad44a5c1e93449ec070cd4d331803f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 21 Sep 2017 22:34:54 +0200 Subject: drm/sun4i: cec: Enable back CEC-pin framework Now that the cec-pin framework has been merged, we can remove the safeguard that were preventing the CEC part of the sun4i HDMI driver and actually start to use it. Signed-off-by: Hans Verkuil Signed-off-by: Maxime Ripard --- drivers/gpu/drm/sun4i/Kconfig | 2 +- drivers/gpu/drm/sun4i/sun4i_hdmi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/sun4i/Kconfig b/drivers/gpu/drm/sun4i/Kconfig index 06f05302ee75..882d85db9053 100644 --- a/drivers/gpu/drm/sun4i/Kconfig +++ b/drivers/gpu/drm/sun4i/Kconfig @@ -26,7 +26,7 @@ config DRM_SUN4I_HDMI_CEC bool "Allwinner A10 HDMI CEC Support" depends on DRM_SUN4I_HDMI select CEC_CORE - depends on CEC_PIN + select CEC_PIN help Choose this option if you have an Allwinner SoC with an HDMI controller and want to use CEC. diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi.h b/drivers/gpu/drm/sun4i/sun4i_hdmi.h index 1457750988da..a1f8cba251a2 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi.h +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi.h @@ -15,7 +15,7 @@ #include #include -#include +#include #define SUN4I_HDMI_CTRL_REG 0x004 #define SUN4I_HDMI_CTRL_ENABLE BIT(31) -- cgit From a4fd4a724d6c30ad671046d83be2e9be2f11d275 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 16:02:05 -0400 Subject: usb-storage: fix bogus hardware error messages for ATA pass-thru devices Ever since commit a621bac3044e ("scsi_lib: correctly retry failed zero length REQ_TYPE_FS commands"), people have been getting bogus error messages for USB disk drives using ATA pass-thru. For example: [ 1344.880193] sd 6:0:0:0: [sdb] Attached SCSI disk [ 1345.069152] sd 6:0:0:0: [sdb] tag#0 FAILED Result: hostbyte=DID_ERROR driverbyte=DRIVER_SENSE [ 1345.069159] sd 6:0:0:0: [sdb] tag#0 Sense Key : Hardware Error [current] [descriptor] [ 1345.069162] sd 6:0:0:0: [sdb] tag#0 Add. Sense: No additional sense information [ 1345.069168] sd 6:0:0:0: [sdb] tag#0 CDB: ATA command pass through(16) 85 06 20 00 00 00 00 00 00 00 00 00 00 00 e5 00 [ 1345.172252] sd 6:0:0:0: [sdb] tag#0 FAILED Result: hostbyte=DID_ERROR driverbyte=DRIVER_SENSE [ 1345.172258] sd 6:0:0:0: [sdb] tag#0 Sense Key : Hardware Error [current] [descriptor] [ 1345.172261] sd 6:0:0:0: [sdb] tag#0 Add. Sense: No additional sense information [ 1345.172266] sd 6:0:0:0: [sdb] tag#0 CDB: ATA command pass through(12)/Blank a1 06 20 da 00 00 4f c2 00 b0 00 00 These messages can be quite annoying, because programs like udisks2 provoke them every 10 minutes or so. Other programs can also have this effect, such as those in smartmontools. I don't fully understand how that commit induced the SCSI core to log these error messages, but the underlying cause for them is code added to usb-storage by commit f1a0743bc0e7 ("USB: storage: When a device returns no sense data, call it a Hardware Error"). At the time it was necessary to do this, in order to prevent an infinite retry loop with some not-so-great mass storage devices. However, the ATA pass-thru protocol uses SCSI sense data to return command status values, and some devices always report Check Condition status for ATA pass-thru commands to ensure that the host retrieves the sense data, even if the command succeeded. This violates the USB mass-storage protocol (Check Condition status is supposed to mean the command failed), but we can't help that. This patch attempts to mitigate the problem of these bogus error reports by changing usb-storage. The HARDWARE ERROR sense key will be inserted only for commands that aren't ATA pass-thru. Thanks to Ewan Milne for pointing out that this mechanism was present in usb-storage. 8 years after writing it, I had completely forgotten its existence. Signed-off-by: Alan Stern Tested-by: Kris Lindgren Ref: https://bugzilla.redhat.com/show_bug.cgi?id=1351305 CC: Ewan D. Milne CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/transport.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 1a59f335b063..a3ccb899df60 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -834,13 +834,25 @@ Retry_Sense: if (result == USB_STOR_TRANSPORT_GOOD) { srb->result = SAM_STAT_GOOD; srb->sense_buffer[0] = 0x0; + } + + /* + * ATA-passthru commands use sense data to report + * the command completion status, and often devices + * return Check Condition status when nothing is + * wrong. + */ + else if (srb->cmnd[0] == ATA_16 || + srb->cmnd[0] == ATA_12) { + /* leave the data alone */ + } /* * If there was a problem, report an unspecified * hardware error to prevent the higher layers from * entering an infinite retry loop. */ - } else { + else { srb->result = DID_ERROR << 16; if ((sshdr.response_code & 0x72) == 0x72) srb->sense_buffer[1] = HARDWARE_ERROR; -- cgit From 113f6eb6d50cfa5e2a1cdcf1678b12661fa272ab Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 15:59:30 -0400 Subject: usb-storage: unusual_devs entry to fix write-access regression for Seagate external drives Kris Lindgren reports that without the NO_WP_DETECT flag, his Seagate external disk drive fails all write accesses. This regresssion dates back approximately to the start of the 4.x kernel releases. Signed-off-by: Alan Stern Reported-by: Kris Lindgren CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 5a70c33ef0e0..eb06d88b41d6 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1459,6 +1459,13 @@ UNUSUAL_DEV( 0x0bc2, 0x3010, 0x0000, 0x0000, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_SANE_SENSE ), +/* Reported by Kris Lindgren */ +UNUSUAL_DEV( 0x0bc2, 0x3332, 0x0000, 0x9999, + "Seagate", + "External", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_WP_DETECT ), + UNUSUAL_DEV( 0x0d49, 0x7310, 0x0000, 0x9999, "Maxtor", "USB to SATA", -- cgit From fd085bb1766d6a598f53af2308374a546a49775a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:40 +0300 Subject: stm class: Fix a use-after-free For reasons unknown, the stm_source removal path uses device_destroy() to kill the underlying device object. Because device_destroy() uses devt to look for the device to destroy and the fact that stm_source devices don't have one (or all have the same one), it just picks the first device in the class, which may well be the wrong one. That is, loading stm_console and stm_heartbeat and then removing both will die in dereferencing a freed object. Since this should have been device_unregister() in the first place, use it instead of device_destroy(). Signed-off-by: Alexander Shishkin Fixes: 7bd1d4093c2 ("stm class: Introduce an abstraction for System Trace Module devices") Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/stm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 9414900575d8..f129869e05a9 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -1119,7 +1119,7 @@ void stm_source_unregister_device(struct stm_source_data *data) stm_source_link_drop(src); - device_destroy(&stm_source_class, src->dev.devt); + device_unregister(&src->dev); } EXPORT_SYMBOL_GPL(stm_source_unregister_device); -- cgit From 920ce7c33db25cf4acb4ade3ae8c93bd23dfd730 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:41 +0300 Subject: intel_th: pci: Add Cedar Fork PCH support This adds Intel(R) Trace Hub PCI ID for Cedar Fork PCH. Signed-off-by: Alexander Shishkin Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index bc9cebc30526..00ee60d9789e 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -158,6 +158,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x9da6), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Cedar Fork PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x18e1), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { 0 }, }; -- cgit From 24600840c74112ad04a9ddd99d7d7f731dcaa1cb Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:42 +0300 Subject: intel_th: pci: Add Lewisburg PCH support This adds Intel(R) Trace Hub PCI ID for Lewisburg PCH. Signed-off-by: Alexander Shishkin Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 00ee60d9789e..c2a2ce8ee541 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -143,6 +143,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x19e1), .driver_data = (kernel_ulong_t)0, }, + { + /* Lewisburg PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa1a6), + .driver_data = (kernel_ulong_t)0, + }, { /* Gemini Lake */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x318e), -- cgit From 33c150c2ee4a65a59190a124b45d05b1abf9478e Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 21 Sep 2017 23:41:47 -0700 Subject: vmbus: don't acquire the mutex in vmbus_hvsock_device_unregister() Due to commit 54a66265d675 ("Drivers: hv: vmbus: Fix rescind handling"), we need this patch to resolve the below deadlock: after we get the mutex in vmbus_hvsock_device_unregister() and call vmbus_device_unregister() -> device_unregister() -> ... -> device_release() -> vmbus_device_release(), we'll get a deadlock, because vmbus_device_release() tries to get the same mutex. Signed-off-by: Dexuan Cui Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Cc: stable@vger.kernel.org (4.13 and above) Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel_mgmt.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 060df71c2e8b..bcbb031f7263 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -936,14 +936,10 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) { - mutex_lock(&vmbus_connection.channel_mutex); - BUG_ON(!is_hvsock_channel(channel)); channel->rescind = true; vmbus_device_unregister(channel->device_obj); - - mutex_unlock(&vmbus_connection.channel_mutex); } EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); -- cgit From 549e658a0919e355a2b2144dc380b3729bef7f3e Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 21 Sep 2017 23:41:48 -0700 Subject: Drivers: hv: fcopy: restore correct transfer length Till recently the expected length of bytes read by the daemon did depend on the context. It was either hv_start_fcopy or hv_do_fcopy. The daemon had a buffer size of two pages, which was much larger than needed. Now the expected length of bytes read by the daemon changed slightly. For START_FILE_COPY it is still the size of hv_start_fcopy. But for WRITE_TO_FILE and the other operations it is as large as the buffer that arrived via vmbus. In case of WRITE_TO_FILE that is slightly larger than a struct hv_do_fcopy. Since the buffer in the daemon was still larger everything was fine. Currently, the daemon reads only what is actually needed. The new buffer layout is as large as a struct hv_do_fcopy, for the WRITE_TO_FILE operation. Since the kernel expects a slightly larger size, hvt_op_read will return -EINVAL because the daemon will read slightly less than expected. Address this by restoring the expected buffer size in case of WRITE_TO_FILE. Fixes: 'c7e490fc23eb ("Drivers: hv: fcopy: convert to hv_utils_transport")' Fixes: '3f2baa8a7d2e ("Tools: hv: update buffer handling in hv_fcopy_daemon")' Signed-off-by: Olaf Hering Signed-off-by: K. Y. Srinivasan Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_fcopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index daa75bd41f86..2364281d8593 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -170,6 +170,10 @@ static void fcopy_send_data(struct work_struct *dummy) out_src = smsg_out; break; + case WRITE_TO_FILE: + out_src = fcopy_transaction.fcopy_msg; + out_len = sizeof(struct hv_do_fcopy); + break; default: out_src = fcopy_transaction.fcopy_msg; out_len = fcopy_transaction.recv_len; -- cgit From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Fri, 22 Sep 2017 07:53:15 +0200 Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache For nested virt we maintain multiple VMCS that can run on a vCPU. So it is incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in vCPU-wide data structures. Hyper-V nested on KVM runs into this consistently for me with PCID enabled. CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3) fires, and the currently loaded VMCS is updated. Then we switch from L2 to L1 and the next exit reverts CR3 to its old value. Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant") Signed-off-by: Ladi Prosek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0726ca7a1b02..c83d28b0ab05 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -200,6 +200,8 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ + unsigned long vmcs_host_cr4; /* May not match real cr4 */ struct list_head loaded_vmcss_on_cpu_link; }; @@ -600,8 +602,6 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -5178,12 +5178,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -9274,15 +9274,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the -- cgit From e001fa78d44d0b5c7b1498d1e4a038740efa3b1e Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 15 Sep 2017 15:26:14 +1000 Subject: KVM: PPC: Book3S HV: Check for updated HDSISR on P9 HDSI exception On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage Interrupt (HDSI) the HDSISR is not be updated at all. To work around this we put a canary value into the HDSISR before returning to a guest and then check for this canary when we take a HDSI. If we find the canary on a HDSI, we know the hardware didn't update the HDSISR. In this case we return to the guest to retake the HDSI which should correctly update the HDSISR the second time HDSI entry. After talking to Paulus we've applied this workaround to all POWER9 CPUs. The workaround of returning to the guest shouldn't ever be triggered on well behaving CPU. The extra instructions should have negligible performance impact. Signed-off-by: Michael Neuling Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 17936f82d3c7..ec69fa45d5a2 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1121,6 +1121,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) BEGIN_FTR_SECTION mtspr SPRN_PPR, r0 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + +/* Move canary into DSISR to check for later */ +BEGIN_FTR_SECTION + li r0, 0x7fff + mtspr SPRN_HDSISR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) @@ -1956,9 +1963,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) kvmppc_hdsi: ld r3, VCPU_KVM(r9) lbz r0, KVM_RADIX(r3) - cmpwi r0, 0 mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR +BEGIN_FTR_SECTION + /* Look for DSISR canary. If we find it, retry instruction */ + cmpdi r6, 0x7fff + beq 6f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + cmpwi r0, 0 bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h -- cgit From e87be9b29c22852ec300826e3b1d551b00c1eb7a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Sep 2017 14:30:43 +0200 Subject: mmc: tmio: remove broken and noisy debug macro Some change for v4.14 broke the debug output for TMIO. But since it was not helpful to me and too noisy for my taste anyhow, let's just remove it instead of fixing it. We'll find something better if we'd need it... Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 47 ---------------------------------------- 1 file changed, 47 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 12cf8288d663..a7293e186e03 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -129,50 +129,6 @@ static int tmio_mmc_next_sg(struct tmio_mmc_host *host) #define CMDREQ_TIMEOUT 5000 -#ifdef CONFIG_MMC_DEBUG - -#define STATUS_TO_TEXT(a, status, i) \ - do { \ - if ((status) & TMIO_STAT_##a) { \ - if ((i)++) \ - printk(KERN_DEBUG " | "); \ - printk(KERN_DEBUG #a); \ - } \ - } while (0) - -static void pr_debug_status(u32 status) -{ - int i = 0; - - pr_debug("status: %08x = ", status); - STATUS_TO_TEXT(CARD_REMOVE, status, i); - STATUS_TO_TEXT(CARD_INSERT, status, i); - STATUS_TO_TEXT(SIGSTATE, status, i); - STATUS_TO_TEXT(WRPROTECT, status, i); - STATUS_TO_TEXT(CARD_REMOVE_A, status, i); - STATUS_TO_TEXT(CARD_INSERT_A, status, i); - STATUS_TO_TEXT(SIGSTATE_A, status, i); - STATUS_TO_TEXT(CMD_IDX_ERR, status, i); - STATUS_TO_TEXT(STOPBIT_ERR, status, i); - STATUS_TO_TEXT(ILL_FUNC, status, i); - STATUS_TO_TEXT(CMD_BUSY, status, i); - STATUS_TO_TEXT(CMDRESPEND, status, i); - STATUS_TO_TEXT(DATAEND, status, i); - STATUS_TO_TEXT(CRCFAIL, status, i); - STATUS_TO_TEXT(DATATIMEOUT, status, i); - STATUS_TO_TEXT(CMDTIMEOUT, status, i); - STATUS_TO_TEXT(RXOVERFLOW, status, i); - STATUS_TO_TEXT(TXUNDERRUN, status, i); - STATUS_TO_TEXT(RXRDY, status, i); - STATUS_TO_TEXT(TXRQ, status, i); - STATUS_TO_TEXT(ILL_ACCESS, status, i); - printk("\n"); -} - -#else -#define pr_debug_status(s) do { } while (0) -#endif - static void tmio_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable) { struct tmio_mmc_host *host = mmc_priv(mmc); @@ -762,9 +718,6 @@ irqreturn_t tmio_mmc_irq(int irq, void *devid) status = sd_ctrl_read16_and_16_as_32(host, CTL_STATUS); ireg = status & TMIO_MASK_IRQ & ~host->sdcard_irq_mask; - pr_debug_status(status); - pr_debug_status(ireg); - /* Clear the status except the interrupt status */ sd_ctrl_write32_as_16_and_16(host, CTL_STATUS, TMIO_MASK_IRQ); -- cgit From c51b46dd5b9950436b6b1f8189e93e1ad380cee1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 20 Sep 2017 22:19:57 +0100 Subject: staging: rtl8723bs: add missing range check on id The value of the u8 id needs to be upper bounds checked to ensure the cam_cache array on the adapter dvobj is not indexed outside of the allowed range of 0..TOTAL_CAM_ENTRY-1. This can currently occur if id is >= TOTAL_CAM_ENTRY when calling write_cam_from_cache. Fix this by adding an upper range check. Detected by CoverityScan, CID#1428464 ("Use of untrusted scalar value") Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver") Signed-off-by: Colin Ian King Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/os_dep/rtw_proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/rtl8723bs/os_dep/rtw_proc.c b/drivers/staging/rtl8723bs/os_dep/rtw_proc.c index 92277457aba4..ce1dd6f9036f 100644 --- a/drivers/staging/rtl8723bs/os_dep/rtw_proc.c +++ b/drivers/staging/rtl8723bs/os_dep/rtw_proc.c @@ -311,6 +311,8 @@ static ssize_t proc_set_cam(struct file *file, const char __user *buffer, size_t if (num < 2) return count; + if (id >= TOTAL_CAM_ENTRY) + return -EINVAL; if (strcmp("c", cmd) == 0) { _clear_cam_entry(adapter, id); -- cgit From ec14121931a24f8d3678b8a9c408adee3b21d465 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 20 Sep 2017 18:34:18 +0100 Subject: staging: rtl8723bs: avoid null pointer dereference on pmlmepriv There is a check to see if pmlmepriv is null before vfree'ing pmlmepriv->free_bss_buf hence implying pmlmepriv could potenially be null. However, a previous call to rtw_free_mlme_priv_ie_data can also dereference pmlmepriv, so move this call so that it is only called if pmlmepriv non-null. Detected by CoverityScan, CID#1077739 ("Dereference before null check") Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver") Signed-off-by: Colin Ian King Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/core/rtw_mlme.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/staging/rtl8723bs/core/rtw_mlme.c b/drivers/staging/rtl8723bs/core/rtw_mlme.c index 6b778206a1a3..cb8a95aabd6c 100644 --- a/drivers/staging/rtl8723bs/core/rtw_mlme.c +++ b/drivers/staging/rtl8723bs/core/rtw_mlme.c @@ -119,9 +119,8 @@ void rtw_free_mlme_priv_ie_data(struct mlme_priv *pmlmepriv) void _rtw_free_mlme_priv(struct mlme_priv *pmlmepriv) { - rtw_free_mlme_priv_ie_data(pmlmepriv); - if (pmlmepriv) { + rtw_free_mlme_priv_ie_data(pmlmepriv); if (pmlmepriv->free_bss_buf) { vfree(pmlmepriv->free_bss_buf); } -- cgit From 6ae033689d7b1a419def78e8e990b0eab8bb6419 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 18 Sep 2017 15:16:08 +0300 Subject: mmc: sdhci-pci: Fix voltage switch for some Intel host controllers Some Intel host controllers (e.g. CNP) use an ACPI device-specific method to ensure correct voltage switching. Fix voltage switch for those, by adding a call to the DSM. Signed-off-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index bbaddf18a1b3..d0ccc6729fd2 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -392,6 +392,7 @@ static const struct sdhci_pci_fixes sdhci_intel_pch_sdio = { enum { INTEL_DSM_FNS = 0, + INTEL_DSM_V18_SWITCH = 3, INTEL_DSM_DRV_STRENGTH = 9, INTEL_DSM_D3_RETUNE = 10, }; @@ -557,6 +558,19 @@ static void intel_hs400_enhanced_strobe(struct mmc_host *mmc, sdhci_writel(host, val, INTEL_HS400_ES_REG); } +static void sdhci_intel_voltage_switch(struct sdhci_host *host) +{ + struct sdhci_pci_slot *slot = sdhci_priv(host); + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct device *dev = &slot->chip->pdev->dev; + u32 result = 0; + int err; + + err = intel_dsm(intel_host, dev, INTEL_DSM_V18_SWITCH, &result); + pr_debug("%s: %s DSM error %d result %u\n", + mmc_hostname(host->mmc), __func__, err, result); +} + static const struct sdhci_ops sdhci_intel_byt_ops = { .set_clock = sdhci_set_clock, .set_power = sdhci_intel_set_power, @@ -565,6 +579,7 @@ static const struct sdhci_ops sdhci_intel_byt_ops = { .reset = sdhci_reset, .set_uhs_signaling = sdhci_set_uhs_signaling, .hw_reset = sdhci_pci_hw_reset, + .voltage_switch = sdhci_intel_voltage_switch, }; static void byt_read_dsm(struct sdhci_pci_slot *slot) -- cgit From c9adcdbc653b52e4be834f535eefec833f9ca6b1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 21 Sep 2017 14:03:29 +0800 Subject: ALSA: pcm: Fix structure definition for X32 ABI X32 ABI uses the 64bit timespec in addition to 64bit alignment of 64bit values. We have added compat ABI for these ioctls, but this patch adds one missing padding into 'struct snd_pcm_mmap_status_x32' to fix incompatibilities. Signed-off-by: Baolin Wang Signed-off-by: Takashi Iwai --- sound/core/pcm_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index 3a1cc7b97e46..b719d0bd833e 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -547,6 +547,7 @@ struct snd_pcm_mmap_status_x32 { u32 pad2; /* alignment */ struct timespec tstamp; s32 suspended_state; + s32 pad3; struct timespec audio_tstamp; } __packed; -- cgit From bfc81a8bc18e3c4ba0cbaa7666ff76be2f998991 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 22 Sep 2017 16:18:53 +0200 Subject: ALSA: usb-audio: Check out-of-bounds access by corrupted buffer descriptor When a USB-audio device receives a maliciously adjusted or corrupted buffer descriptor, the USB-audio driver may access an out-of-bounce value at its parser. This was detected by syzkaller, something like: BUG: KASAN: slab-out-of-bounds in usb_audio_probe+0x27b2/0x2ab0 Read of size 1 at addr ffff88006b83a9e8 by task kworker/0:1/24 CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.14.0-rc1-42251-gebb2c2437d80 #224 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x22f/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x19/0x20 mm/kasan/report.c:427 snd_usb_create_streams sound/usb/card.c:248 usb_audio_probe+0x27b2/0x2ab0 sound/usb/card.c:605 usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_set_configuration+0x104e/0x1870 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4903 hub_port_connect_change drivers/usb/core/hub.c:5009 port_event drivers/usb/core/hub.c:5115 hub_event+0x194d/0x3740 drivers/usb/core/hub.c:5195 process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119 worker_thread+0x221/0x1850 kernel/workqueue.c:2253 kthread+0x3a1/0x470 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 This patch adds the checks of out-of-bounce accesses at appropriate places and bails out when it goes out of the given buffer. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/card.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sound/usb/card.c b/sound/usb/card.c index 3dc36d913550..23d1d23aefec 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -221,6 +221,7 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) struct usb_interface_descriptor *altsd; void *control_header; int i, protocol; + int rest_bytes; /* find audiocontrol interface */ host_iface = &usb_ifnum_to_if(dev, ctrlif)->altsetting[0]; @@ -235,6 +236,15 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) return -EINVAL; } + rest_bytes = (void *)(host_iface->extra + host_iface->extralen) - + control_header; + + /* just to be sure -- this shouldn't hit at all */ + if (rest_bytes <= 0) { + dev_err(&dev->dev, "invalid control header\n"); + return -EINVAL; + } + switch (protocol) { default: dev_warn(&dev->dev, @@ -245,11 +255,21 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) case UAC_VERSION_1: { struct uac1_ac_header_descriptor *h1 = control_header; + if (rest_bytes < sizeof(*h1)) { + dev_err(&dev->dev, "too short v1 buffer descriptor\n"); + return -EINVAL; + } + if (!h1->bInCollection) { dev_info(&dev->dev, "skipping empty audio interface (v1)\n"); return -EINVAL; } + if (rest_bytes < h1->bLength) { + dev_err(&dev->dev, "invalid buffer length (v1)\n"); + return -EINVAL; + } + if (h1->bLength < sizeof(*h1) + h1->bInCollection) { dev_err(&dev->dev, "invalid UAC_HEADER (v1)\n"); return -EINVAL; -- cgit From c4fa6c43ce4b427350cfbb659436bfe3d9e09a1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 21 Sep 2017 09:54:13 -0400 Subject: cgroup: Reinit cgroup_taskset structure before cgroup_migrate_execute() returns The cgroup_taskset structure within the larger cgroup_mgctx structure is supposed to be used once and then discarded. That is not really the case in the hotplug code path: cpuset_hotplug_workfn() - cgroup_transfer_tasks() - cgroup_migrate() - cgroup_migrate_add_task() - cgroup_migrate_execute() In this case, the cgroup_migrate() function is called multiple time with the same cgroup_mgctx structure to transfer the tasks from one cgroup to another one-by-one. The second time cgroup_migrate() is called, the cgroup_taskset will be in an incorrect state and so may cause the system to panic. For example, [ 150.888410] Faulting instruction address: 0xc0000000001db648 [ 150.888414] Oops: Kernel access of bad area, sig: 11 [#1] [ 150.888417] SMP NR_CPUS=2048 [ 150.888417] NUMA [ 150.888419] pSeries : [ 150.888545] NIP [c0000000001db648] cpuset_can_attach+0x58/0x1b0 [ 150.888548] LR [c0000000001db638] cpuset_can_attach+0x48/0x1b0 [ 150.888551] Call Trace: [ 150.888554] [c0000005f65cb940] [c0000000001db638] cpuset_can_attach+0x48/0x1b 0 (unreliable) [ 150.888559] [c0000005f65cb9a0] [c0000000001cff04] cgroup_migrate_execute+0xc4/0x4b0 [ 150.888563] [c0000005f65cba20] [c0000000001d7d14] cgroup_transfer_tasks+0x1d4/0x370 [ 150.888568] [c0000005f65cbb70] [c0000000001ddcb0] cpuset_hotplug_workfn+0x710/0x8f0 [ 150.888572] [c0000005f65cbc80] [c00000000012032c] process_one_work+0x1ac/0x4d0 [ 150.888576] [c0000005f65cbd20] [c0000000001206f8] worker_thread+0xa8/0x5b0 [ 150.888580] [c0000005f65cbdc0] [c0000000001293f8] kthread+0x168/0x1b0 [ 150.888584] [c0000005f65cbe30] [c00000000000b368] ret_from_kernel_thread+0x5c/0x74 To allow reuse of the cgroup_mgctx structure, some fields in that structure are now re-initialized at the end of cgroup_migrate_execute() function call so that the structure can be reused again in a later iteration without causing problem. This bug was introduced in the commit e595cd706982 ("group: track migration context in cgroup_mgctx") in 4.11. This commit moves the cgroup_taskset initialization out of cgroup_migrate(). The commit 10467270fb3 ("cgroup: don't call migration methods if there are no tasks to migrate") helped, but did not completely resolve the problem. Fixes: e595cd706982bff0211e6fafe5a108421e747fbc ("group: track migration context in cgroup_mgctx") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.11+ --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..44857278eb8a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2311,6 +2311,14 @@ out_release_tset: list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); + + /* + * Re-initialize the cgroup_taskset structure in case it is reused + * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() + * iteration. + */ + tset->nr_tasks = 0; + tset->csets = &tset->src_csets; return ret; } -- cgit From 786de92b3cb26012d3d0f00ee37adf14527f35c4 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 22 Sep 2017 11:56:49 -0400 Subject: USB: uas: fix bug in handling of alternate settings The uas driver has a subtle bug in the way it handles alternate settings. The uas_find_uas_alt_setting() routine returns an altsetting value (the bAlternateSetting number in the descriptor), but uas_use_uas_driver() then treats that value as an index to the intf->altsetting array, which it isn't. Normally this doesn't cause any problems because the various alternate settings have bAlternateSetting values 0, 1, 2, ..., so the value is equal to the index in the array. But this is not guaranteed, and Andrey Konovalov used the syzkaller fuzzer with KASAN to get a slab-out-of-bounds error by violating this assumption. This patch fixes the bug by making uas_find_uas_alt_setting() return a pointer to the altsetting entry rather than either the value or the index. Pointers are less subject to misinterpretation. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Oliver Neukum CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/uas-detect.h | 15 ++++++++------- drivers/usb/storage/uas.c | 10 +++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/usb/storage/uas-detect.h b/drivers/usb/storage/uas-detect.h index f58caa9e6a27..a155cd02bce2 100644 --- a/drivers/usb/storage/uas-detect.h +++ b/drivers/usb/storage/uas-detect.h @@ -9,7 +9,8 @@ static int uas_is_interface(struct usb_host_interface *intf) intf->desc.bInterfaceProtocol == USB_PR_UAS); } -static int uas_find_uas_alt_setting(struct usb_interface *intf) +static struct usb_host_interface *uas_find_uas_alt_setting( + struct usb_interface *intf) { int i; @@ -17,10 +18,10 @@ static int uas_find_uas_alt_setting(struct usb_interface *intf) struct usb_host_interface *alt = &intf->altsetting[i]; if (uas_is_interface(alt)) - return alt->desc.bAlternateSetting; + return alt; } - return -ENODEV; + return NULL; } static int uas_find_endpoints(struct usb_host_interface *alt, @@ -58,14 +59,14 @@ static int uas_use_uas_driver(struct usb_interface *intf, struct usb_device *udev = interface_to_usbdev(intf); struct usb_hcd *hcd = bus_to_hcd(udev->bus); unsigned long flags = id->driver_info; - int r, alt; - + struct usb_host_interface *alt; + int r; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) + if (!alt) return 0; - r = uas_find_endpoints(&intf->altsetting[alt], eps); + r = uas_find_endpoints(alt, eps); if (r < 0) return 0; diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index cfb1e3bbd434..63cf981ed81c 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -873,14 +873,14 @@ MODULE_DEVICE_TABLE(usb, uas_usb_ids); static int uas_switch_interface(struct usb_device *udev, struct usb_interface *intf) { - int alt; + struct usb_host_interface *alt; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) - return alt; + if (!alt) + return -ENODEV; - return usb_set_interface(udev, - intf->altsetting[0].desc.bInterfaceNumber, alt); + return usb_set_interface(udev, alt->desc.bInterfaceNumber, + alt->desc.bAlternateSetting); } static int uas_configure_endpoints(struct uas_dev_info *devinfo) -- cgit From 6e76c01e71551cb221c1f3deacb9dcd9a7346784 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 16:12:01 -0400 Subject: USB: gadgetfs: fix copy_to_user while holding spinlock The gadgetfs driver as a long-outstanding FIXME, regarding a call of copy_to_user() made while holding a spinlock. This patch fixes the issue by dropping the spinlock and using the dev->udc_usage mechanism introduced by another recent patch to guard against status changes while the lock isn't held. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov CC: Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 684900fcfe24..956b3dc7c3a4 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -983,11 +983,14 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); -// FIXME don't call this with the spinlock held ... + ++dev->udc_usage; + spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; + spin_lock_irq(&dev->lock); + --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } -- cgit From 520b72fc64debf8a86c3853b8e486aa5982188f0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:23:58 -0400 Subject: USB: gadgetfs: Fix crash caused by inadequate synchronization The gadgetfs driver (drivers/usb/gadget/legacy/inode.c) was written before the UDC and composite frameworks were adopted; it is a legacy driver. As such, it expects that once bound to a UDC controller, it will not be unbound until it unregisters itself. However, the UDC framework does unbind function drivers while they are still registered. When this happens, it can cause the gadgetfs driver to misbehave or crash. For example, userspace can cause a crash by opening the device file and doing an ioctl call before setting up a configuration (found by Andrey Konovalov using the syzkaller fuzzer). This patch adds checks and synchronization to prevent these bad behaviors. It adds a udc_usage counter that the driver increments at times when it is using a gadget interface without holding the private spinlock. The unbind routine waits for this counter to go to 0 before returning, thereby ensuring that the UDC is no longer in use. The patch also adds a check in the dev_ioctl() routine to make sure the driver is bound to a UDC before dereferencing the gadget pointer, and it makes destroy_ep_files() synchronize with the endpoint I/O routines, to prevent the user from accessing an endpoint data structure after it has been removed. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 41 ++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 956b3dc7c3a4..5c28bee327e1 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -28,7 +28,7 @@ #include #include #include - +#include #include #include @@ -116,6 +116,7 @@ enum ep0_state { struct dev_data { spinlock_t lock; refcount_t count; + int udc_usage; enum ep0_state state; /* P: lock */ struct usb_gadgetfs_event event [N_EVENT]; unsigned ev_next; @@ -513,9 +514,9 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) INIT_WORK(&priv->work, ep_user_copy_worker); schedule_work(&priv->work); } - spin_unlock(&epdata->dev->lock); usb_ep_free_request(ep, req); + spin_unlock(&epdata->dev->lock); put_ep(epdata); } @@ -939,9 +940,11 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { + ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); + --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; @@ -1134,6 +1137,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; + ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; @@ -1145,6 +1149,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) GFP_KERNEL); } spin_lock_irq(&dev->lock); + --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else @@ -1246,9 +1251,21 @@ static long dev_ioctl (struct file *fd, unsigned code, unsigned long value) struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; - if (gadget->ops->ioctl) + spin_lock_irq(&dev->lock); + if (dev->state == STATE_DEV_OPENED || + dev->state == STATE_DEV_UNBOUND) { + /* Not bound to a UDC */ + } else if (gadget->ops->ioctl) { + ++dev->udc_usage; + spin_unlock_irq(&dev->lock); + ret = gadget->ops->ioctl (gadget, code, value); + spin_lock_irq(&dev->lock); + --dev->udc_usage; + } + spin_unlock_irq(&dev->lock); + return ret; } @@ -1466,10 +1483,12 @@ delegate: if (value < 0) break; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); + --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; @@ -1493,8 +1512,12 @@ delegate: req->length = value; req->zero = value < w_length; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); + spin_lock(&dev->lock); + --dev->udc_usage; + spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; @@ -1521,21 +1544,24 @@ static void destroy_ep_files (struct dev_data *dev) /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); + spin_unlock_irq (&dev->lock); + dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ + mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; + mutex_unlock(&ep->lock); + wake_up (&ep->wait); put_ep (ep); - spin_unlock_irq (&dev->lock); - /* break link to dcache */ inode_lock(parent); d_delete (dentry); @@ -1606,6 +1632,11 @@ gadgetfs_unbind (struct usb_gadget *gadget) spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; + while (dev->udc_usage > 0) { + spin_unlock_irq(&dev->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dev->lock); + } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); -- cgit From 1fbbb78f25d1291274f320462bf6908906f538db Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:22:00 -0400 Subject: USB: g_mass_storage: Fix deadlock when driver is unbound As a holdover from the old g_file_storage gadget, the g_mass_storage legacy gadget driver attempts to unregister itself when its main operating thread terminates (if it hasn't been unregistered already). This is not strictly necessary; it was never more than an attempt to have the gadget fail cleanly if something went wrong and the main thread was killed. However, now that the UDC core manages gadget drivers independently of UDC drivers, this scheme doesn't work any more. A simple test: modprobe dummy-hcd modprobe g-mass-storage file=... rmmod dummy-hcd ends up in a deadlock with the following backtrace: sysrq: SysRq : Show Blocked State task PC stack pid father file-storage D 0 1130 2 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_preempt_disabled+0xd/0xf __mutex_lock.isra.1+0x129/0x224 ? _raw_spin_unlock_irqrestore+0x12/0x14 __mutex_lock_slowpath+0x12/0x14 mutex_lock+0x28/0x2b usb_gadget_unregister_driver+0x29/0x9b [udc_core] usb_composite_unregister+0x10/0x12 [libcomposite] msg_cleanup+0x1d/0x20 [g_mass_storage] msg_thread_exits+0xd/0xdd7 [g_mass_storage] fsg_main_thread+0x1395/0x13d6 [usb_f_mass_storage] ? __schedule+0x573/0x58c kthread+0xd9/0xdb ? do_set_interface+0x25c/0x25c [usb_f_mass_storage] ? init_completion+0x1e/0x1e ret_from_fork+0x19/0x24 rmmod D 0 1155 683 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_timeout+0x26/0xbc ? __schedule+0x573/0x58c do_wait_for_common+0xb3/0x128 ? usleep_range+0x81/0x81 ? wake_up_q+0x3f/0x3f wait_for_common+0x2e/0x45 wait_for_completion+0x17/0x19 fsg_common_put+0x34/0x81 [usb_f_mass_storage] fsg_free_inst+0x13/0x1e [usb_f_mass_storage] usb_put_function_instance+0x1a/0x25 [libcomposite] msg_unbind+0x2a/0x42 [g_mass_storage] __composite_unbind+0x4a/0x6f [libcomposite] composite_unbind+0x12/0x14 [libcomposite] usb_gadget_remove_driver+0x4f/0x77 [udc_core] usb_del_gadget_udc+0x52/0xcc [udc_core] dummy_udc_remove+0x27/0x2c [dummy_hcd] platform_drv_remove+0x1d/0x31 device_release_driver_internal+0xe9/0x16d device_release_driver+0x11/0x13 bus_remove_device+0xd2/0xe2 device_del+0x19f/0x221 ? selinux_capable+0x22/0x27 platform_device_del+0x21/0x63 platform_device_unregister+0x10/0x1a cleanup+0x20/0x817 [dummy_hcd] SyS_delete_module+0x10c/0x197 ? ____fput+0xd/0xf ? task_work_run+0x55/0x62 ? prepare_exit_to_usermode+0x65/0x75 do_fast_syscall_32+0x86/0xc3 entry_SYSENTER_32+0x4e/0x7c What happens is that removing the dummy-hcd driver causes the UDC core to unbind the gadget driver, which it does while holding the udc_lock mutex. The unbind routine in g_mass_storage tells the main thread to exit and waits for it to terminate. But as mentioned above, when the main thread exits it tries to unregister the mass-storage function driver. Via the composite framework this ends up calling usb_gadget_unregister_driver(), which tries to acquire the udc_lock mutex. The result is deadlock. The simplest way to fix the problem is not to be so clever: The main thread doesn't have to unregister the function driver. The side effects won't be so terrible; if the gadget is still attached to a USB host when the main thread is killed, it will appear to the host as though the gadget's firmware has crashed -- a reasonably accurate interpretation, and an all-too-common occurrence for USB mass-storage devices. In fact, the code to unregister the driver when the main thread exits is specific to g-mass-storage; it is not used when f-mass-storage is included as a function in a larger composite device. Therefore the entire mechanism responsible for this (the fsg_operations structure with its ->thread_exits method, the fsg_common_set_ops() routine, and the msg_thread_exits() callback routine) can all be eliminated. Even the msg_registered bitflag can be removed, because now the driver is unregistered in only one place rather than in two places. Signed-off-by: Alan Stern CC: Acked-by: Felipe Balbi Acked-by: Michal Nazarewicz Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_mass_storage.c | 27 +++++++-------------------- drivers/usb/gadget/function/f_mass_storage.h | 14 -------------- drivers/usb/gadget/legacy/mass_storage.c | 26 +++----------------------- 3 files changed, 10 insertions(+), 57 deletions(-) diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index d6bd0244b008..5153e29870c3 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -307,8 +307,6 @@ struct fsg_common { struct completion thread_notifier; struct task_struct *thread_task; - /* Callback functions. */ - const struct fsg_operations *ops; /* Gadget's private data. */ void *private_data; @@ -2438,6 +2436,7 @@ static void handle_exception(struct fsg_common *common) static int fsg_main_thread(void *common_) { struct fsg_common *common = common_; + int i; /* * Allow the thread to be killed by a signal, but set the signal mask @@ -2476,21 +2475,16 @@ static int fsg_main_thread(void *common_) common->thread_task = NULL; spin_unlock_irq(&common->lock); - if (!common->ops || !common->ops->thread_exits - || common->ops->thread_exits(common) < 0) { - int i; + /* Eject media from all LUNs */ - down_write(&common->filesem); - for (i = 0; i < ARRAY_SIZE(common->luns); i++) { - struct fsg_lun *curlun = common->luns[i]; - if (!curlun || !fsg_lun_is_open(curlun)) - continue; + down_write(&common->filesem); + for (i = 0; i < ARRAY_SIZE(common->luns); i++) { + struct fsg_lun *curlun = common->luns[i]; + if (curlun && fsg_lun_is_open(curlun)) fsg_lun_close(curlun); - curlun->unit_attention_data = SS_MEDIUM_NOT_PRESENT; - } - up_write(&common->filesem); } + up_write(&common->filesem); /* Let fsg_unbind() know the thread has exited */ complete_and_exit(&common->thread_notifier, 0); @@ -2681,13 +2675,6 @@ void fsg_common_remove_luns(struct fsg_common *common) } EXPORT_SYMBOL_GPL(fsg_common_remove_luns); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops) -{ - common->ops = ops; -} -EXPORT_SYMBOL_GPL(fsg_common_set_ops); - void fsg_common_free_buffers(struct fsg_common *common) { _fsg_common_free_buffers(common->buffhds, common->fsg_num_buffers); diff --git a/drivers/usb/gadget/function/f_mass_storage.h b/drivers/usb/gadget/function/f_mass_storage.h index d3902313b8ac..dc05ca0c4359 100644 --- a/drivers/usb/gadget/function/f_mass_storage.h +++ b/drivers/usb/gadget/function/f_mass_storage.h @@ -60,17 +60,6 @@ struct fsg_module_parameters { struct fsg_common; /* FSF callback functions */ -struct fsg_operations { - /* - * Callback function to call when thread exits. If no - * callback is set or it returns value lower then zero MSF - * will force eject all LUNs it operates on (including those - * marked as non-removable or with prevent_medium_removal flag - * set). - */ - int (*thread_exits)(struct fsg_common *common); -}; - struct fsg_lun_opts { struct config_group group; struct fsg_lun *lun; @@ -142,9 +131,6 @@ void fsg_common_remove_lun(struct fsg_lun *lun); void fsg_common_remove_luns(struct fsg_common *common); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops); - int fsg_common_create_lun(struct fsg_common *common, struct fsg_lun_config *cfg, unsigned int id, const char *name, const char **name_pfx); diff --git a/drivers/usb/gadget/legacy/mass_storage.c b/drivers/usb/gadget/legacy/mass_storage.c index e99ab57ee3e5..fcba59782f26 100644 --- a/drivers/usb/gadget/legacy/mass_storage.c +++ b/drivers/usb/gadget/legacy/mass_storage.c @@ -107,15 +107,6 @@ static unsigned int fsg_num_buffers = CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS; FSG_MODULE_PARAMETERS(/* no prefix */, mod_data); -static unsigned long msg_registered; -static void msg_cleanup(void); - -static int msg_thread_exits(struct fsg_common *common) -{ - msg_cleanup(); - return 0; -} - static int msg_do_config(struct usb_configuration *c) { struct fsg_opts *opts; @@ -154,9 +145,6 @@ static struct usb_configuration msg_config_driver = { static int msg_bind(struct usb_composite_dev *cdev) { - static const struct fsg_operations ops = { - .thread_exits = msg_thread_exits, - }; struct fsg_opts *opts; struct fsg_config config; int status; @@ -173,8 +161,6 @@ static int msg_bind(struct usb_composite_dev *cdev) if (status) goto fail; - fsg_common_set_ops(opts->common, &ops); - status = fsg_common_set_cdev(opts->common, cdev, config.can_stall); if (status) goto fail_set_cdev; @@ -256,18 +242,12 @@ MODULE_LICENSE("GPL"); static int __init msg_init(void) { - int ret; - - ret = usb_composite_probe(&msg_driver); - set_bit(0, &msg_registered); - - return ret; + return usb_composite_probe(&msg_driver); } module_init(msg_init); -static void msg_cleanup(void) +static void __exit msg_cleanup(void) { - if (test_and_clear_bit(0, &msg_registered)) - usb_composite_unregister(&msg_driver); + usb_composite_unregister(&msg_driver); } module_exit(msg_cleanup); -- cgit From af2e658fc08a397b10352265e50b83f27e25d73e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:35 +0300 Subject: as3645a: Use ams,input-max-microamp as documented in DT bindings DT bindings document the property "ams,input-max-microamp" that limits the chip's maximum input current. The driver and the DTS however used "peak-current-limit" property. Fix this by using the property documented in DT binding documentation. Signed-off-by: Sakari Ailus Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- arch/arm/boot/dts/omap3-n950-n9.dtsi | 2 +- drivers/leds/leds-as3645a.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index cb47ae79a5f9..b86fc83a5a65 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -273,7 +273,7 @@ flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; - peak-current-limit = <1750000>; + ams,input-max-microamp = <1750000>; }; indicator { led-max-microamp = <10000>; diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index bbbbe0898233..e3f89c6130d2 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -534,7 +534,7 @@ static int as3645a_parse_node(struct as3645a *flash, of_property_read_u32(flash->flash_node, "voltage-reference", &cfg->voltage_reference); - of_property_read_u32(flash->flash_node, "peak-current-limit", + of_property_read_u32(flash->flash_node, "ams,input-max-microamp", &cfg->peak); cfg->peak = AS_PEAK_mA_TO_REG(cfg->peak); -- cgit From 75f9f7279e874ff95d1abe4613abc0826c9a8dcc Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:36 +0300 Subject: dt: bindings: as3645a: Use LED number to refer to LEDs Use integers (reg property) to tell the number of the LED to the driver instead of the node name. While both of these approaches are currently used by the LED bindings, using integers will require less driver changes for ACPI support. Additionally, it will make possible LED naming using chip and LED node names, effectively making the label property most useful for human-readable names only. Signed-off-by: Sakari Ailus Acked-by: Rob Herring Signed-off-by: Jacek Anaszewski --- .../devicetree/bindings/leds/ams,as3645a.txt | 28 ++++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/ams,as3645a.txt b/Documentation/devicetree/bindings/leds/ams,as3645a.txt index 12c5ef26ec73..fdc40e354a64 100644 --- a/Documentation/devicetree/bindings/leds/ams,as3645a.txt +++ b/Documentation/devicetree/bindings/leds/ams,as3645a.txt @@ -15,11 +15,14 @@ Required properties compatible : Must be "ams,as3645a". reg : The I2C address of the device. Typically 0x30. +#address-cells : 1 +#size-cells : 0 -Required properties of the "flash" child node -============================================= +Required properties of the flash child node (0) +=============================================== +reg: 0 flash-timeout-us: Flash timeout in microseconds. The value must be in the range [100000, 850000] and divisible by 50000. flash-max-microamp: Maximum flash current in microamperes. Has to be @@ -33,20 +36,21 @@ ams,input-max-microamp: Maximum flash controller input current. The and divisible by 50000. -Optional properties of the "flash" child node -============================================= +Optional properties of the flash child node +=========================================== label : The label of the flash LED. -Required properties of the "indicator" child node -================================================= +Required properties of the indicator child node (1) +=================================================== +reg: 1 led-max-microamp: Maximum indicator current. The allowed values are 2500, 5000, 7500 and 10000. -Optional properties of the "indicator" child node -================================================= +Optional properties of the indicator child node +=============================================== label : The label of the indicator LED. @@ -55,16 +59,20 @@ Example ======= as3645a@30 { + #address-cells: 1 + #size-cells: 0 reg = <0x30>; compatible = "ams,as3645a"; - flash { + flash@0 { + reg = <0x0>; flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; ams,input-max-microamp = <1750000>; label = "as3645a:flash"; }; - indicator { + indicator@1 { + reg = <0x1>; led-max-microamp = <10000>; label = "as3645a:indicator"; }; -- cgit From e626c325277531db15314b80610d1f5a1c2637b2 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:37 +0300 Subject: as3645a: Use integer numbers for parsing LEDs Use integer numbers for LEDs, 0 is the flash and 1 is the indicator. Signed-off-by: Sakari Ailus Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- arch/arm/boot/dts/omap3-n950-n9.dtsi | 8 ++++++-- drivers/leds/leds-as3645a.c | 26 ++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index b86fc83a5a65..1b0bd72945f2 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -267,15 +267,19 @@ clock-frequency = <400000>; as3645a@30 { + #address-cells = <1>; + #size-cells = <0>; reg = <0x30>; compatible = "ams,as3645a"; - flash { + flash@0 { + reg = <0x0>; flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; ams,input-max-microamp = <1750000>; }; - indicator { + indicator@1 { + reg = <0x1>; led-max-microamp = <10000>; }; }; diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index e3f89c6130d2..605e0c64e974 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -112,6 +112,10 @@ #define AS_PEAK_mA_TO_REG(a) \ ((min_t(u32, AS_PEAK_mA_MAX, a) - 1250) / 250) +/* LED numbers for Devicetree */ +#define AS_LED_FLASH 0 +#define AS_LED_INDICATOR 1 + enum as_mode { AS_MODE_EXT_TORCH = 0 << AS_CONTROL_MODE_SETTING_SHIFT, AS_MODE_INDICATOR = 1 << AS_CONTROL_MODE_SETTING_SHIFT, @@ -491,10 +495,29 @@ static int as3645a_parse_node(struct as3645a *flash, struct device_node *node) { struct as3645a_config *cfg = &flash->cfg; + struct device_node *child; const char *name; int rval; - flash->flash_node = of_get_child_by_name(node, "flash"); + for_each_child_of_node(node, child) { + u32 id = 0; + + of_property_read_u32(child, "reg", &id); + + switch (id) { + case AS_LED_FLASH: + flash->flash_node = of_node_get(child); + break; + case AS_LED_INDICATOR: + flash->indicator_node = of_node_get(child); + break; + default: + dev_warn(&flash->client->dev, + "unknown LED %u encountered, ignoring\n", id); + break; + } + } + if (!flash->flash_node) { dev_err(&flash->client->dev, "can't find flash node\n"); return -ENODEV; @@ -538,7 +561,6 @@ static int as3645a_parse_node(struct as3645a *flash, &cfg->peak); cfg->peak = AS_PEAK_mA_TO_REG(cfg->peak); - flash->indicator_node = of_get_child_by_name(node, "indicator"); if (!flash->indicator_node) { dev_warn(&flash->client->dev, "can't find indicator node\n"); -- cgit From 12c4b878e71fa8b65bc479b2460765c7d1d81a26 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:38 +0300 Subject: as3645a: Unregister indicator LED on device unbind The indicator LED was registered in probe but was not removed in driver remove callback. Fix this. Signed-off-by: Sakari Ailus Signed-off-by: Jacek Anaszewski --- drivers/leds/leds-as3645a.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index 605e0c64e974..9a257f969300 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -743,6 +743,7 @@ static int as3645a_remove(struct i2c_client *client) as3645a_set_control(flash, AS_MODE_EXT_TORCH, false); v4l2_flash_release(flash->vf); + v4l2_flash_release(flash->vfind); led_classdev_flash_unregister(&flash->fled); led_classdev_unregister(&flash->iled_cdev); -- cgit From 28585a832602747cbfa88ad8934013177a3aae38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Sep 2017 14:10:22 -0700 Subject: rcu: Allow for page faults in NMI handlers A number of architecture invoke rcu_irq_enter() on exception entry in order to allow RCU read-side critical sections in the exception handler when the exception is from an idle or nohz_full CPU. This works, at least unless the exception happens in an NMI handler. In that case, rcu_nmi_enter() would already have exited the extended quiescent state, which would mean that rcu_irq_enter() would (incorrectly) cause RCU to think that it is again in an extended quiescent state. This will in turn result in lockdep splats in response to later RCU read-side critical sections. This commit therefore causes rcu_irq_enter() and rcu_irq_exit() to take no action if there is an rcu_nmi_enter() in effect, thus avoiding the unscheduled return to RCU quiescent state. This in turn should make the kernel safe for on-demand RCU voyeurism. Link: http://lkml.kernel.org/r/20170922211022.GA18084@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..63bee8e1b193 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -888,6 +888,11 @@ void rcu_irq_exit(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdtp->dynticks_nesting < 1); if (rdtp->dynticks_nesting <= 1) { @@ -1020,6 +1025,11 @@ void rcu_irq_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && -- cgit From 9aadde91b3c035413c806619beb3e3ef6e697953 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:22:19 -0400 Subject: extable: Consolidate *kernel_text_address() functions The functionality between kernel_text_address() and _kernel_text_address() is the same except that _kernel_text_address() does a little more (that function needs a rename, but that can be done another time). Instead of having duplicate code in both, simply have _kernel_text_address() calls kernel_text_address() instead. This is marked for stable because there's an RCU bug that can happen if one of these functions gets called while RCU is not watching. That fix depends on this fix to keep from having to write the fix twice. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index 38c2412401a1..a7024a494faf 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr) int __kernel_text_address(unsigned long addr) { - if (core_kernel_text(addr)) - return 1; - if (is_module_text_address(addr)) - return 1; - if (is_ftrace_trampoline(addr)) - return 1; - if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; - if (is_bpf_text_address(addr)) + if (kernel_text_address(addr)) return 1; /* * There might be init symbols in saved stacktraces. -- cgit From e8cac8b1d10589be45671a5ade0926a639b543b7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:36:32 -0400 Subject: extable: Enable RCU if it is not watching in kernel_text_address() If kernel_text_address() is called when RCU is not watching, it can cause an RCU bug because is_module_text_address(), the is_kprobe_*insn_slot() and is_bpf_text_address() functions require the use of RCU. Only enable RCU if it is not currently watching before it calls is_module_text_address(). The use of rcu_nmi_enter() is used to enable RCU because kernel_text_address() can happen pretty much anywhere (like an NMI), and even from within an NMI. It is called via save_stack_trace() that can be called by any WARN() or tracing function, which can happen while RCU is not watching (for example, going to or coming from idle, or during CPU take down or bring up). Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index a7024a494faf..9aa1cc41ecf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -119,17 +119,42 @@ int __kernel_text_address(unsigned long addr) int kernel_text_address(unsigned long addr) { + bool no_rcu; + int ret = 1; + if (core_kernel_text(addr)) return 1; + + /* + * If a stack dump happens while RCU is not watching, then + * RCU needs to be notified that it requires to start + * watching again. This can happen either by tracing that + * triggers a stack trace, or a WARN() that happens during + * coming back from idle, or cpu on or offlining. + * + * is_module_text_address() as well as the kprobe slots + * and is_bpf_text_address() require RCU to be watching. + */ + no_rcu = !rcu_is_watching(); + + /* Treat this like an NMI as it can happen anywhere */ + if (no_rcu) + rcu_nmi_enter(); + if (is_module_text_address(addr)) - return 1; + goto out; if (is_ftrace_trampoline(addr)) - return 1; + goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; + goto out; if (is_bpf_text_address(addr)) - return 1; - return 0; + goto out; + ret = 0; +out: + if (no_rcu) + rcu_nmi_exit(); + + return ret; } /* -- cgit From 15516c89acce948debc4c598e03c3fee53045797 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Sep 2017 13:00:21 -0400 Subject: tracing: Remove RCU work arounds from stack tracer Currently the stack tracer calls rcu_irq_enter() to make sure RCU is watching when it records a stack trace. But if the stack tracer is triggered while tracing inside of a rcu_irq_enter(), calling rcu_irq_enter() unconditionally can be problematic. The reason for having rcu_irq_enter() in the first place has been fixed from within the saving of the stack trace code, and there's no reason for doing it in the stack tracer itself. Just remove it. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Suggested-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; - /* - * There's a slight chance that we are tracing inside the - * RCU infrastructure, and rcu_irq_enter() will not work - * as expected. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); - /* - * RCU may not be watching, make it see us. - * The stack trace code uses rcu_sched. - */ - rcu_irq_enter(); - /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - rcu_irq_exit(); arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } -- cgit From 05cf97e7a619fc7ede81ee6bb8ebfa7531b633f5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 22 Sep 2017 01:01:11 +0200 Subject: cnic: Fix an error handling path in 'cnic_alloc_bnx2x_resc()' All the error handling paths 'goto error', except this one. We should also go to error in this case, or some resources will be leaking. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/cnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index cec94bbb2ea5..8bc126a156e8 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -1278,7 +1278,7 @@ static int cnic_alloc_bnx2x_resc(struct cnic_dev *dev) ret = cnic_alloc_dma(dev, kwq_16_dma, pages, 0); if (ret) - return -ENOMEM; + goto error; n = CNIC_PAGE_SIZE / CNIC_KWQ16_DATA_SIZE; for (i = 0, j = 0; i < cp->max_cid_space; i++) { -- cgit From 5c346525d3591cb032eca86d0f904cc01f1069ff Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 21 Sep 2017 18:00:36 -0600 Subject: net: qualcomm: rmnet: Fix rcu splat in rmnet_is_real_dev_registered Xiaolong reported a suspicious rcu_dereference_check in the device unregister notifier callback. Since we do not dereference the rx_handler_data, it's ok to just check for the value of the pointer. Note that this section is already protected by rtnl_lock. [ 101.364846] WARNING: suspicious RCU usage [ 101.365654] 4.13.0-rc6-01701-gceed73a #1 Not tainted [ 101.370873] ----------------------------- [ 101.372472] drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c:57 suspicious rcu_dereference_check() usage! [ 101.374427] [ 101.374427] other info that might help us debug this: [ 101.374427] [ 101.387491] [ 101.387491] rcu_scheduler_active = 2, debug_locks = 1 [ 101.389368] 1 lock held by trinity-main/2809: [ 101.390736] #0: (rtnl_mutex){+.+.+.}, at: [<8146085b>] rtnl_lock+0xf/0x11 [ 101.395482] [ 101.395482] stack backtrace: [ 101.396948] CPU: 0 PID: 2809 Comm: trinity-main Not tainted 4.13.0-rc6-01701-gceed73a #1 [ 101.398857] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014 [ 101.401079] Call Trace: [ 101.401656] dump_stack+0xa1/0xeb [ 101.402871] lockdep_rcu_suspicious+0xc7/0xd0 [ 101.403665] rmnet_is_real_dev_registered+0x40/0x4e [ 101.405199] rmnet_config_notify_cb+0x2c/0x142 [ 101.406344] ? wireless_nlevent_flush+0x47/0x71 [ 101.407385] notifier_call_chain+0x2d/0x47 [ 101.408645] raw_notifier_call_chain+0xc/0xe [ 101.409882] call_netdevice_notifiers_info+0x41/0x49 [ 101.411402] call_netdevice_notifiers+0xc/0xe [ 101.412713] rollback_registered_many+0x268/0x36e [ 101.413702] rollback_registered+0x39/0x56 [ 101.414965] unregister_netdevice_queue+0x79/0x88 [ 101.415908] unregister_netdev+0x16/0x1d Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation") Signed-off-by: Subash Abhinov Kasiviswanathan Reported-by: kernel test robot Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 98f22551eb45..1e33aea59f50 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -51,10 +51,7 @@ struct rmnet_walk_data { static int rmnet_is_real_dev_registered(const struct net_device *real_dev) { - rx_handler_func_t *rx_handler; - - rx_handler = rcu_dereference(real_dev->rx_handler); - return (rx_handler == rmnet_rx_handler); + return rcu_access_pointer(real_dev->rx_handler) == rmnet_rx_handler; } /* Needs rtnl lock */ -- cgit From 656f083116a4799d8c0194976b8a2d66bf306538 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:44 +0200 Subject: x86/fpu: Rename copyin_to_xsaves()/copyout_from_xsaves() to copy_user_to_xstate()/copy_xstate_to_user() The 'copyin/copyout' nomenclature needlessly departs from what the modern FPU code uses, which is: copy_fpregs_to_fpstate() copy_fpstate_to_sigframe() copy_fregs_to_user() copy_fxregs_to_kernel() copy_fxregs_to_user() copy_kernel_to_fpregs() copy_kernel_to_fregs() copy_kernel_to_fxregs() copy_kernel_to_xregs() copy_user_to_fregs() copy_user_to_fxregs() copy_user_to_xregs() copy_xregs_to_kernel() copy_xregs_to_user() I.e. according to this pattern, the following rename should be done: copyin_to_xsaves() -> copy_user_to_xstate() copyout_from_xsaves() -> copy_xstate_to_user() or, if we want to be pedantic, denote that that the user-space format is ptrace: copyin_to_xsaves() -> copy_user_ptrace_to_xstate() copyout_from_xsaves() -> copy_xstate_to_user_ptrace() But I'd suggest the shorter, non-pedantic name. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-2-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 1b2799e0699a..a1baa17e9748 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b188b16841e3..165d0545c924 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -92,7 +92,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); if (using_compacted_format()) { - ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* @@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copyin_to_xsaves(kbuf, ubuf, xsave); + ret = copy_user_to_xstate(kbuf, ubuf, xsave); else ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 83c23c230b4c..b1fe9a1fc4e0 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__drop(fpu); if (using_compacted_format()) { - err = copyin_to_xsaves(NULL, buf_fx, + err = copy_user_to_xstate(NULL, buf_fx, &fpu->state.xsave); } else { err = __copy_from_user(&fpu->state.xsave, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c24ac1efb12d..e7bb41723eaa 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -951,7 +951,7 @@ static inline int xstate_copyout(unsigned int pos, unsigned int count, * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; @@ -1023,7 +1023,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; -- cgit From f0d4f30a7fd299587840a028655285a87f334904 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:45 +0200 Subject: x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user() copy_xstate_to_user() is a weird API - in part due to a bad API inherited from the regset APIs. But don't propagate that bad API choice into the FPU code - so as a first step split the API into kernel and user buffer handling routines. (Also split the xstate_copyout() internal helper.) The split API is a dumb duplication that should be obviously correct, the real splitting will be done in the next patch. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-3-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 +- arch/x86/kernel/fpu/regset.c | 5 +- arch/x86/kernel/fpu/xstate.c | 110 +++++++++++++++++++++++++++++++++++--- 3 files changed, 109 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index a1baa17e9748..92dc8ca14124 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 165d0545c924..b6d12d66d04b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -92,7 +92,10 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); if (using_compacted_format()) { - ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); + if (kbuf) + ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave); + else + ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e7bb41723eaa..38561539cb99 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -924,10 +924,106 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static inline int xstate_copyout(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +static inline int +__copy_xstate_to_kernel(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) +{ + if ((count == 0) || (pos < start_pos)) + return 0; + + if (end_pos < 0 || pos < end_pos) { + unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + + if (kbuf) { + memcpy(kbuf + pos, data, copy); + } else { + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; + } + } + return 0; +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a kernel-space ptrace buffer. + * + * It supports partial copy but pos always starts from zero. This is called + * from xstateregs_get() and there we check the CPU has XSAVES. + */ +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave) +{ + unsigned int offset, size; + int ret, i; + struct xstate_header header; + + /* + * Currently copy_regset_to_user() starts from pos 0: + */ + if (unlikely(pos != 0)) + return -EFAULT; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * Copy xregs_state->header: + */ + offset = offsetof(struct xregs_state, header); + size = sizeof(header); + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count); + + if (ret) + return ret; + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count); + + if (ret) + return ret; + + if (offset + size >= count) + break; + } + + } + + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + + if (ret) + return ret; + + return 0; +} + +static inline int +__copy_xstate_to_user(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -977,7 +1073,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count); if (ret) return ret; @@ -992,7 +1088,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count); if (ret) return ret; @@ -1009,7 +1105,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; -- cgit From 4d981cf2d96f29cdfa7d4972c8b377fe7baa9c4c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:46 +0200 Subject: x86/fpu: Remove 'ubuf' parameter from the copy_xstate_to_kernel() APIs The 'ubuf' parameter is unused in the _kernel() side of the API, remove it. This simplifies the code and makes it easier to think about. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-4-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 21 ++++++--------------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 92dc8ca14124..c762574a245f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,7 +48,7 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b6d12d66d04b..34e74adf9d5d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,7 +93,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (using_compacted_format()) { if (kbuf) - ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); else ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 38561539cb99..71d3bda2b898 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -926,7 +926,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ static inline int __copy_xstate_to_kernel(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, + void *kbuf, const void *data, const int start_pos, const int end_pos) { @@ -936,12 +936,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, if (end_pos < 0 || pos < end_pos) { unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; - } + memcpy(kbuf + pos, data, copy); } return 0; } @@ -953,8 +948,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. */ -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave) { unsigned int offset, size; int ret, i; @@ -979,8 +973,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count); if (ret) return ret; @@ -994,8 +987,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count); if (ret) return ret; @@ -1011,8 +1003,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; -- cgit From a69c158fb3e7a91220f55029bf222a4e678d16e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:47 +0200 Subject: x86/fpu: Remove 'kbuf' parameter from the copy_xstate_to_user() APIs The 'kbuf' parameter is unused in the _user() side of the API, remove it. This simplifies the code and makes it easier to think about. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-5-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 25 +++++++------------------ 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c762574a245f..65bd68c30cd0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -49,7 +49,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 34e74adf9d5d..fd6dbdd8fde6 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -95,7 +95,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (kbuf) ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); else - ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_user(pos, count, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 71d3bda2b898..2d8f3344875d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1011,10 +1011,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru } static inline int -__copy_xstate_to_user(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -1022,12 +1019,8 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, if (end_pos < 0 || pos < end_pos) { unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; - } + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; } return 0; } @@ -1038,8 +1031,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; int ret, i; @@ -1064,8 +1056,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count); if (ret) return ret; @@ -1079,8 +1070,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count); if (ret) return ret; @@ -1096,8 +1086,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; -- cgit From d7eda6c99cc75f1c41d67abf988f37a10045a370 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:48 +0200 Subject: x86/fpu: Clean up parameter order in the copy_xstate_to_*() APIs Parameter ordering is weird: int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); 'pos' and 'count', which are attributes of the destination buffer, are listed before the destination buffer itself ... List them after the primary arguments instead. This makes the code more similar to regular memcpy() variant APIs. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-6-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 65bd68c30cd0..e4430b84939d 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index fd6dbdd8fde6..ec1404194b65 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,9 +93,9 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (using_compacted_format()) { if (kbuf) - ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); + ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); else - ret = copy_xstate_to_user(pos, count, ubuf, xsave); + ret = copy_xstate_to_user(ubuf, xsave, pos, count); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2d8f3344875d..0a299468510f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -925,10 +925,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * the source data pointer or increment pos, count, kbuf, and ubuf. */ static inline int -__copy_xstate_to_kernel(unsigned int pos, unsigned int count, - void *kbuf, - const void *data, const int start_pos, - const int end_pos) +__copy_xstate_to_kernel(void *kbuf, + const void *data, + unsigned int pos, unsigned int count, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -948,7 +947,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. */ -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave) +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) { unsigned int offset, size; int ret, i; @@ -973,7 +972,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count); if (ret) return ret; @@ -987,7 +986,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count); if (ret) return ret; @@ -1003,7 +1002,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count); if (ret) return ret; @@ -1011,7 +1010,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru } static inline int -__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -1031,7 +1030,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, c * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) { unsigned int offset, size; int ret, i; @@ -1056,7 +1055,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count); if (ret) return ret; @@ -1070,7 +1069,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count); if (ret) return ret; @@ -1086,7 +1085,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count); if (ret) return ret; -- cgit From becb2bb72ff906cc0d2bac3ee9574f694364823b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:49 +0200 Subject: x86/fpu: Clean up the parameter definitions of copy_xstate_to_*() Remove pointless 'const' of non-pointer input parameter. Remove unnecessary parenthesis that shows uncertainty about arithmetic operator precedence. Clarify copy_xstate_to_user() description. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-7-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0a299468510f..9647e7256179 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,13 +927,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, const int start_pos, const int end_pos) + unsigned int pos, unsigned int count, int start_pos, int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); memcpy(kbuf + pos, data, copy); } @@ -1010,13 +1010,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); if (__copy_to_user(ubuf + pos, data, copy)) return -EFAULT; @@ -1026,7 +1026,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns /* * Convert from kernel XSAVES compacted format to standard format and copy - * to a ptrace buffer. It supports partial copy but pos always starts from + * to a user-space buffer. It supports partial copy but pos always starts from * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -- cgit From 8a5b731889cbf004b406d988dc591c8a7aac773e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:50 +0200 Subject: x86/fpu: Remove the 'start_pos' parameter from the __copy_xstate_to_*() functions 'start_pos' is always 0, so remove it and remove the pointless check of 'pos < 0' which can not ever be true as 'pos' is unsigned ... No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-8-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9647e7256179..1f50fdaf4c5a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,9 +927,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, int start_pos, int end_pos) + unsigned int pos, unsigned int count, int end_pos) { - if ((count == 0) || (pos < start_pos)) + if (!count) return 0; if (end_pos < 0 || pos < end_pos) { @@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count); if (ret) return ret; @@ -986,7 +986,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count); if (ret) return ret; @@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count); if (ret) return ret; @@ -1010,9 +1010,9 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos) { - if ((count == 0) || (pos < start_pos)) + if (!count) return 0; if (end_pos < 0 || pos < end_pos) { @@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, count); if (ret) return ret; @@ -1069,7 +1069,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, count); if (ret) return ret; @@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count); if (ret) return ret; -- cgit From 56583c9a1400fe1935edd55b24b4fbbc779b59cb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:51 +0200 Subject: x86/fpu: Clarify parameter names in the copy_xstate_to_*() methods Right now there's a confusing mixture of 'offset' and 'size' parameters: - __copy_xstate_to_*() input parameter 'end_pos' not not really an offset, but the full size of the copy to be performed. - input parameter 'count' to copy_xstate_to_*() shadows that of __copy_xstate_to_*()'s 'count' parameter name - but the roles are different: the first one is the total number of bytes to be copied, while the second one is a partial copy size. To unconfuse all this, use a consistent set of parameter names: - 'size' is the partial copy size within a single xstate component - 'size_total' is the total copy requested - 'offset_start' is the requested starting offset. - 'offset' is the offset within an xstate component. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-9-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/xstate.c | 44 +++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index e4430b84939d..fed6617a1079 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1f50fdaf4c5a..c13083579655 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,15 +927,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, int end_pos) + unsigned int offset, unsigned int size, int size_total) { - if (!count) + if (!size) return 0; - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); + if (size_total < 0 || offset < size_total) { + unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); - memcpy(kbuf + pos, data, copy); + memcpy(kbuf + offset, data, copy); } return 0; } @@ -947,7 +947,7 @@ __copy_xstate_to_kernel(void *kbuf, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. */ -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -956,7 +956,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); if (ret) return ret; @@ -986,11 +986,11 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); if (ret) return ret; - if (offset + size >= count) + if (offset + size >= size_total) break; } @@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; @@ -1010,15 +1010,15 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total) { - if (!count) + if (!size) return 0; - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); + if (size_total < 0 || offset < size_total) { + unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); - if (__copy_to_user(ubuf + pos, data, copy)) + if (__copy_to_user(ubuf + offset, data, copy)) return -EFAULT; } return 0; @@ -1030,7 +1030,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -1039,7 +1039,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(ubuf, &header, offset, size, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); if (ret) return ret; @@ -1069,11 +1069,11 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(ubuf, src, offset, size, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - if (offset + size >= count) + if (offset + size >= size_total) break; } @@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; -- cgit From 6ff15f8db7eaf29ef5ead6afbec9b25485fe8703 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:52 +0200 Subject: x86/fpu: Change 'size_total' parameter to unsigned and standardize the size checks in copy_xstate_to_*() 'size_total' is derived from an unsigned input parameter - and then converted to 'int' and checked for negative ranges: if (size_total < 0 || offset < size_total) { This conversion and the checks are unnecessary obfuscation, reject overly large requested copy sizes outright and simplify the underlying code. Reported-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-10-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c13083579655..b18c5457065a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -925,15 +925,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * the source data pointer or increment pos, count, kbuf, and ubuf. */ static inline int -__copy_xstate_to_kernel(void *kbuf, - const void *data, - unsigned int offset, unsigned int size, int size_total) +__copy_xstate_to_kernel(void *kbuf, const void *data, + unsigned int offset, unsigned int size, unsigned int size_total) { - if (!size) - return 0; - - if (size_total < 0 || offset < size_total) { - unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); memcpy(kbuf + offset, data, copy); } @@ -986,12 +982,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = xstate_offsets[i]; size = xstate_sizes[i]; + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= size_total) - break; } } @@ -1010,13 +1007,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) { if (!size) return 0; - if (size_total < 0 || offset < size_total) { - unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); if (__copy_to_user(ubuf + offset, data, copy)) return -EFAULT; @@ -1069,12 +1066,13 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= size_total) - break; } } -- cgit From 8c0817f4a3188ac5485ce14f96f12a175800b881 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:53 +0200 Subject: x86/fpu: Simplify __copy_xstate_to_kernel() return values __copy_xstate_to_kernel() can only return 0 (because kernel copies cannot fail), simplify the code throughout. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-11-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b18c5457065a..00c3b41c3cf1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -924,7 +924,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static inline int +static inline void __copy_xstate_to_kernel(void *kbuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) { @@ -933,7 +933,6 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, memcpy(kbuf + offset, data, copy); } - return 0; } /* @@ -946,8 +945,8 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; - int ret, i; struct xstate_header header; + int i; /* * Currently copy_regset_to_user() starts from pos 0: @@ -968,9 +967,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); for (i = 0; i < XFEATURE_MAX; i++) { /* @@ -986,9 +983,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of if (offset + size > size_total) break; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); } } @@ -999,9 +994,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); return 0; } -- cgit From 79fecc2b7506f29fb91becc65e8788e5ae7eba9f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:54 +0200 Subject: x86/fpu: Split copy_user_to_xstate() into copy_kernel_to_xstate() & copy_user_to_xstate() Similar to: x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user() No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-12-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 +-- arch/x86/kernel/fpu/regset.c | 10 ++++-- arch/x86/kernel/fpu/xstate.c | 66 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fed6617a1079..79af79dbcab6 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave); +int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ec1404194b65..cb45dd81d617 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -134,10 +134,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copy_user_to_xstate(kbuf, ubuf, xsave); - else + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (kbuf) + ret = copy_kernel_to_xstate(kbuf, ubuf, xsave); + else + ret = copy_user_to_xstate(kbuf, ubuf, xsave); + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + } /* * In case of failure, mark all states as init: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 00c3b41c3cf1..1ad25d1b8056 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1084,7 +1084,71 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } /* - * Convert from a ptrace standard-format buffer to kernel XSAVES format + * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set() and + * there we check the CPU has XSAVES and a whole standard-sized buffer + * exists. + */ +int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave) +{ + unsigned int offset, size; + int i; + u64 xfeatures; + u64 allowed_features; + + offset = offsetof(struct xregs_state, header); + size = sizeof(xfeatures); + + if (kbuf) { + memcpy(&xfeatures, kbuf + offset, size); + } else { + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; + } + + /* + * Reject if the user sets any disabled or supervisor features: + */ + allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + + if (xfeatures & ~allowed_features) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= xfeatures; + + return 0; +} + +/* + * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format * and copy to the target thread. This is called from xstateregs_set() and * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. -- cgit From 59dffa4edba1f15b2bfdbe608aca1efe664c674c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:55 +0200 Subject: x86/fpu: Remove 'ubuf' parameter from the copy_kernel_to_xstate() API No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-13-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 17 +++-------------- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 79af79dbcab6..f10889bc0c88 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index cb45dd81d617..785302c75f38 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -136,7 +136,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) - ret = copy_kernel_to_xstate(kbuf, ubuf, xsave); + ret = copy_kernel_to_xstate(kbuf, xsave); else ret = copy_user_to_xstate(kbuf, ubuf, xsave); } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1ad25d1b8056..71cc8d367fdd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1089,8 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) { unsigned int offset, size; int i; @@ -1100,12 +1099,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(xfeatures); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; - } + memcpy(&xfeatures, kbuf + offset, size); /* * Reject if the user sets any disabled or supervisor features: @@ -1124,12 +1118,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + memcpy(dst, kbuf + offset, size); } } -- cgit From 7b9094c688f807c110a2dab6f6edc5876bfa7b0b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:56 +0200 Subject: x86/fpu: Remove 'kbuf' parameter from the copy_user_to_xstate() API No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-14-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 11 ++++------- arch/x86/kernel/fpu/xstate.c | 19 +++++-------------- 4 files changed, 11 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index f10889bc0c88..4ceb90740d80 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -51,5 +51,5 @@ int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 785302c75f38..caf723f31737 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -138,7 +138,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (kbuf) ret = copy_kernel_to_xstate(kbuf, xsave); else - ret = copy_user_to_xstate(kbuf, ubuf, xsave); + ret = copy_user_to_xstate(ubuf, xsave); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b1fe9a1fc4e0..2c685b492fd6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -323,13 +323,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (using_compacted_format()) { - err = copy_user_to_xstate(NULL, buf_fx, - &fpu->state.xsave); - } else { - err = __copy_from_user(&fpu->state.xsave, - buf_fx, state_size); - } + if (using_compacted_format()) + err = copy_user_to_xstate(buf_fx, &fpu->state.xsave); + else + err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 71cc8d367fdd..b1f3e4dae2e3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1142,8 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; int i; @@ -1153,12 +1152,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(xfeatures); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; /* * Reject if the user sets any disabled or supervisor features: @@ -1177,12 +1172,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } } -- cgit From 6d7f7da5533a3f841eeb1d9657257c9367924274 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:57 +0200 Subject: x86/fpu: Flip the parameter order in copy_*_to_xstate() Make it more consistent with regular memcpy() semantics, where the destination argument comes first. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-15-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 4ceb90740d80..579ac2358e63 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); -int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave); +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index caf723f31737..19a7385a912c 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -136,9 +136,9 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) - ret = copy_kernel_to_xstate(kbuf, xsave); + ret = copy_kernel_to_xstate(xsave, kbuf); else - ret = copy_user_to_xstate(ubuf, xsave); + ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2c685b492fd6..2d682dac35d4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__drop(fpu); if (using_compacted_format()) - err = copy_user_to_xstate(buf_fx, &fpu->state.xsave); + err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); else err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b1f3e4dae2e3..0ef35040d0ad 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1089,7 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; @@ -1142,7 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave) +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; -- cgit From b3a163081c28d1a4d1ad76259a9d93b34a82f1da Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:58 +0200 Subject: x86/fpu: Simplify fpu->fpregs_active use The fpregs_active() inline function is pretty pointless - in almost all the callsites it can be replaced with a direct fpu->fpregs_active access. Do so and eliminate the extra layer of obfuscation. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-16-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 17 +---------------- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 9 +++++---- arch/x86/mm/pkeys.c | 2 +- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 554cdb205d17..b223c57dd5dc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -542,21 +542,6 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int fpregs_active(void) -{ - return current->thread.fpu.fpregs_active; -} - /* * FPU state switching for scheduling. * @@ -617,7 +602,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = ¤t->thread.fpu; preempt_disable(); - if (!fpregs_active()) + if (!fpu->fpregs_active) fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e1114f070c2d..bad57248e5a0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -367,7 +367,7 @@ void fpu__current_fpstate_write_end(void) * registers may still be out of date. Update them with * an XRSTOR if they are active. */ - if (fpregs_active()) + if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2d682dac35d4..684025654d0c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,7 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct fpu *fpu = ¤t->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -170,13 +171,13 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active() || using_compacted_format()) { + if (fpu->fpregs_active || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) - copy_fxregs_to_kernel(&tsk->thread.fpu); + copy_fxregs_to_kernel(fpu); } else { /* * It is a *bug* if kernel uses compacted-format for xsave @@ -189,7 +190,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) return -1; } - fpstate_sanitize_xstate(&tsk->thread.fpu); + fpstate_sanitize_xstate(fpu); if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2dab69a706ec..e2c23472233e 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -45,7 +45,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - fpregs_active() && + current->thread.fpu.fpregs_active && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; -- cgit From a10b6a16cdad88170f546d008c77453cddf918e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:59 +0200 Subject: x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic Do this temporarily only, to make it easier to change the FPU state machine, in particular this change couples the fpu->fpregs_active and fpu->fpstate_active states: they are only set/cleared together (as far as the scheduler sees them). This will be removed by later patches. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-17-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index bad57248e5a0..b7dc3833d41a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -462,9 +462,11 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized. */ if (static_cpu_has(X86_FEATURE_FPU)) { + preempt_disable(); fpu__activate_curr(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); + preempt_enable(); } } -- cgit From b6aa85558d7e7b18fc3470d2bc1731d2205dd275 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:00 +0200 Subject: x86/fpu: Split the state handling in fpu__drop() Prepare fpu__drop() to use fpu->fpregs_active. There are two distinct usecases for fpu__drop() in this context: exit_thread() when called for 'current' in exit(), and when called for another task in fork(). This patch does not change behavior, it only adds a couple of debug checks and structures the code to make the ->fpregs_active change more obviously correct. All the complications will be removed later on. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-18-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b7dc3833d41a..815dfba7781a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -414,12 +414,19 @@ void fpu__drop(struct fpu *fpu) { preempt_disable(); - if (fpu->fpregs_active) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); + if (fpu == ¤t->thread.fpu) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + + if (fpu->fpregs_active) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + if (fpu->fpregs_active) + fpregs_deactivate(fpu); + } + } else { + WARN_ON_FPU(fpu->fpregs_active); } fpu->fpstate_active = 0; -- cgit From f1c8cd0176078c7bcafdc89cac447cab672a0b5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:01 +0200 Subject: x86/fpu: Change fpu->fpregs_active users to fpu->fpstate_active We want to simplify the FPU state machine by eliminating fpu->fpregs_active, and we can do that because the two state flags (::fpregs_active and ::fpstate_active) are set essentially together. The old lazy FPU switching code used to make a distinction - but there's no lazy switching code anymore, we always switch in an 'eager' fashion. Do this by first changing all substantial uses of fpu->fpregs_active to fpu->fpstate_active and adding a few debug checks to double check our assumption is correct. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-19-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 +++- arch/x86/kernel/fpu/core.c | 16 ++++++++++------ arch/x86/kernel/fpu/signal.c | 4 +++- arch/x86/mm/pkeys.c | 3 +-- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b223c57dd5dc..7fa676f93ac1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -556,7 +556,9 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->fpregs_active) { + WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active); + + if (old_fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 815dfba7781a..eab244622402 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { /* * Ignore return value -- we don't care if reg state * is clobbered. @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpregs_active) + if (fpu->fpstate_active) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -147,8 +147,10 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + trace_x86_fpu_before_save(fpu); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -262,11 +264,12 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); */ void fpu__activate_fpstate_read(struct fpu *fpu) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * If fpregs are active (in the current CPU), then * copy them to the fpstate: */ - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { fpu__save(fpu); } else { if (!fpu->fpstate_active) { @@ -362,12 +365,13 @@ void fpu__current_fpstate_write_end(void) { struct fpu *fpu = ¤t->thread.fpu; + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * 'fpu' now has an updated copy of the state, but the * registers may still be out of date. Update them with * an XRSTOR if they are active. */ - if (fpu->fpregs_active) + if (fpu->fpstate_active) copy_kernel_to_fpregs(&fpu->state); /* @@ -417,7 +421,7 @@ void fpu__drop(struct fpu *fpu) if (fpu == ¤t->thread.fpu) { WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 684025654d0c..a88083ba7f8b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,9 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->fpregs_active || using_compacted_format()) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + + if (fpu->fpstate_active || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e2c23472233e..4d24269c071f 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,7 +18,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* fpregs_active() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.fpregs_active && + current->thread.fpu.fpstate_active && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; -- cgit From 6cf4edbe0526db311a28734609da888fdfcb3604 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:02 +0200 Subject: x86/fpu: Decouple fpregs_activate()/fpregs_deactivate() from fpu->fpregs_active The fpregs_activate()/fpregs_deactivate() are currently called in such a pattern: if (!fpu->fpregs_active) fpregs_activate(fpu); ... if (fpu->fpregs_active) fpregs_deactivate(fpu); But note that it's actually safe to call them without checking the flag first. This further decouples the fpu->fpregs_active flag from actual FPU logic. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-20-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 7 +------ arch/x86/kernel/fpu/core.c | 3 +-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7fa676f93ac1..42a601673c09 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -526,8 +526,6 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) */ static inline void fpregs_deactivate(struct fpu *fpu) { - WARN_ON_FPU(!fpu->fpregs_active); - fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); @@ -535,8 +533,6 @@ static inline void fpregs_deactivate(struct fpu *fpu) static inline void fpregs_activate(struct fpu *fpu) { - WARN_ON_FPU(fpu->fpregs_active); - fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); @@ -604,8 +600,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = ¤t->thread.fpu; preempt_disable(); - if (!fpu->fpregs_active) - fpregs_activate(fpu); + fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index eab244622402..01a47e9edfb4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,8 +426,7 @@ void fpu__drop(struct fpu *fpu) asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - if (fpu->fpregs_active) - fpregs_deactivate(fpu); + fpregs_deactivate(fpu); } } else { WARN_ON_FPU(fpu->fpregs_active); -- cgit From 99dc26bda233ee722bbd370bddf20beece3ffb93 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:03 +0200 Subject: x86/fpu: Remove struct fpu::fpregs_active The previous changes paved the way for the removal of the fpu::fpregs_active state flag - we now only have the fpu::fpstate_active and fpu::last_cpu fields left. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-21-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 5 ----- arch/x86/include/asm/fpu/types.h | 23 ----------------------- arch/x86/include/asm/trace/fpu.h | 5 +---- arch/x86/kernel/fpu/core.c | 9 --------- arch/x86/kernel/fpu/signal.c | 2 -- 5 files changed, 1 insertion(+), 43 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42a601673c09..629e7abcd6c9 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -526,14 +526,12 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) */ static inline void fpregs_deactivate(struct fpu *fpu) { - fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { - fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } @@ -552,8 +550,6 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active); - if (old_fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; @@ -561,7 +557,6 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! */ - old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); } else old_fpu->last_cpu = -1; diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 3c80f5b9c09d..0c314a397cf5 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -298,29 +298,6 @@ struct fpu { */ unsigned char fpstate_active; - /* - * @fpregs_active: - * - * This flag determines whether a given context is actively - * loaded into the FPU's registers and that those registers - * represent the task's current FPU state. - * - * Note the interaction with fpstate_active: - * - * # task does not use the FPU: - * fpstate_active == 0 - * - * # task uses the FPU and regs are active: - * fpstate_active == 1 && fpregs_active == 1 - * - * # the regs are inactive but still match fpstate: - * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu - * - * The third state is what we use for the lazy restore optimization - * on lazy-switching CPUs. - */ - unsigned char fpregs_active; - /* * @state: * diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 342e59789fcd..da565aae9fd2 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -12,7 +12,6 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpregs_active) __field(bool, fpstate_active) __field(u64, xfeatures) __field(u64, xcomp_bv) @@ -20,16 +19,14 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_fast_assign( __entry->fpu = fpu; - __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpregs_active, __entry->fpstate_active, __entry->xfeatures, __entry->xcomp_bv diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 01a47e9edfb4..93103a909c47 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -147,8 +147,6 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - trace_x86_fpu_before_save(fpu); if (fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(fpu)) { @@ -191,7 +189,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) @@ -264,7 +261,6 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); */ void fpu__activate_fpstate_read(struct fpu *fpu) { - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * If fpregs are active (in the current CPU), then * copy them to the fpstate: @@ -365,7 +361,6 @@ void fpu__current_fpstate_write_end(void) { struct fpu *fpu = ¤t->thread.fpu; - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * 'fpu' now has an updated copy of the state, but the * registers may still be out of date. Update them with @@ -419,8 +414,6 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpstate_active) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" @@ -428,8 +421,6 @@ void fpu__drop(struct fpu *fpu) _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } - } else { - WARN_ON_FPU(fpu->fpregs_active); } fpu->fpstate_active = 0; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a88083ba7f8b..629106e51a29 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,8 +171,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpstate_active || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) -- cgit From 0852b374173bb57f870d78e6c6839c77b339be5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 23 Sep 2017 15:00:04 +0200 Subject: x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs On Skylake CPUs I noticed that XRSTOR is unable to deal with states created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not XFEATURE_MASK_FP. The reason is that part of the SSE/YMM state lives in the MXCSR and MXCSR_FLAGS fields of the FP state. Ensure that whenever we copy SSE or YMM state around, the MXCSR and MXCSR_FLAGS fields are also copied around. Signed-off-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170210085445.0f1cc708@annuminas.surriel.com Link: http://lkml.kernel.org/r/20170923130016.21448-22-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 0c314a397cf5..71db45ca8870 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -68,6 +68,9 @@ struct fxregs_state { /* Default value for fxregs_state.mxcsr: */ #define MXCSR_DEFAULT 0x1f80 +/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */ +#define MXCSR_AND_FLAGS_SIZE sizeof(u64) + /* * Software based FPU emulation state. This is arbitrary really, * it matches the x87 format to make it easier to understand: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0ef35040d0ad..41c52256bdce 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -920,6 +920,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ +/* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return 0; + + if (xfeatures & XFEATURE_MASK_FP) + return 0; + + return 1; +} + /* * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. @@ -988,6 +1005,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1070,6 +1093,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1122,6 +1151,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1177,6 +1212,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': -- cgit From 4f8cef59bad29344aca0e2e6b0ad18dadd078fd0 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Sat, 23 Sep 2017 15:00:05 +0200 Subject: x86/fpu: Fix boolreturn.cocci warnings arch/x86/kernel/fpu/xstate.c:931:9-10: WARNING: return of 0/1 in function 'xfeatures_mxcsr_quirk' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Signed-off-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Yu-cheng Yu Cc: kbuild-all@01.org Cc: tipbuild@zytor.com Link: http://lkml.kernel.org/r/20170306004553.GA25764@lkp-wsm-ep1 Link: http://lkml.kernel.org/r/20170923130016.21448-23-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 41c52256bdce..fda1109cc355 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -929,12 +929,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) { if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) - return 0; + return false; if (xfeatures & XFEATURE_MASK_FP) - return 0; + return false; - return 1; + return true; } /* -- cgit From 03eaec81ac09814817e9f0307d572ffe8365f980 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 23 Sep 2017 15:00:06 +0200 Subject: x86/fpu: Turn WARN_ON() in context switch into WARN_ON_FPU() copy_xregs_to_kernel checks if the alternatives have been already patched. This WARN_ON() is always executed in every context switch. All the other checks in fpu internal.h are WARN_ON_FPU(), but this one is plain WARN_ON(). I assume it was forgotten to switch it. So switch it to WARN_ON_FPU() too to avoid some unnecessary code in the context switch, and a potentially expensive cache line miss for the global variable. Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170329062605.4970-1-andi@firstfloor.org Link: http://lkml.kernel.org/r/20170923130016.21448-24-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 629e7abcd6c9..2dca7c65319c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -350,7 +350,7 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) u32 hmask = mask >> 32; int err; - WARN_ON(!alternatives_patched); + WARN_ON_FPU(!alternatives_patched); XSTATE_XSAVE(xstate, lmask, hmask, err); -- cgit From 245a396a9b1a67ac5c3228737c261b3e48708a2a Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:18 +0200 Subject: iio: adc: twl4030: Fix an error handling path in 'twl4030_madc_probe()' If 'devm_regulator_get()' fails, we should go through the existing error handling path instead of returning directly, as done is all the other error handling paths in this function. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/twl4030-madc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index bd3d37fc2144..252f5890311e 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -887,8 +887,10 @@ static int twl4030_madc_probe(struct platform_device *pdev) /* Enable 3v1 bias regulator for MADC[3:6] */ madc->usb3v1 = devm_regulator_get(madc->dev, "vusb3v1"); - if (IS_ERR(madc->usb3v1)) - return -ENODEV; + if (IS_ERR(madc->usb3v1)) { + ret = -ENODEV; + goto err_i2c; + } ret = regulator_enable(madc->usb3v1); if (ret) -- cgit From 7f70be6e4025db0551e6863e7eb9cca07122695c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:19 +0200 Subject: iio: adc: twl4030: Disable the vusb3v1 rugulator in the error handling path of 'twl4030_madc_probe()' Commit 7cc97d77ee8a has introduced a call to 'regulator_disable()' in the .remove function. So we should also have such a call in the .probe function in case of error after a successful 'regulator_enable()' call. Add a new label for that and use it. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/twl4030-madc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 252f5890311e..0c86fbb3033e 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -899,11 +899,13 @@ static int twl4030_madc_probe(struct platform_device *pdev) ret = iio_device_register(iio_dev); if (ret) { dev_err(&pdev->dev, "could not register iio device\n"); - goto err_i2c; + goto err_usb3v1; } return 0; +err_usb3v1: + regulator_disable(madc->usb3v1); err_i2c: twl4030_madc_set_current_generator(madc, 0, 0); err_current_generator: -- cgit From 53063846affd27def6f96e13a9fb80b9a3c2d126 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:20 +0200 Subject: iio: adc: twl4030: Return an error if we can not enable the vusb3v1 regulator in 'twl4030_madc_probe()' If we can not enable the regulator, go through the error handling path instead of silently continuing. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Signed-off-by: Jonathan Cameron --- drivers/iio/adc/twl4030-madc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 0c86fbb3033e..28df096e84ec 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -893,8 +893,10 @@ static int twl4030_madc_probe(struct platform_device *pdev) } ret = regulator_enable(madc->usb3v1); - if (ret) + if (ret) { dev_err(madc->dev, "could not enable 3v1 bias regulator\n"); + goto err_i2c; + } ret = iio_device_register(iio_dev); if (ret) { -- cgit From 0a56eabc4e3f730782e4a9f3af4f60aa03a8a849 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 18 Sep 2017 12:05:30 +0200 Subject: iio: trigger: stm32-timer: preset shouldn't be buffered Currently, setting preset value (ARR) will update directly 'Auto reload value' only on 1st write access. But then, ARPE is set. This makes ARR a shadow register. Preset value should be updated upon each write request: ensure ARPE is 0. This fixes successive writes to preset attribute. Fixes: 4adec7da0536 ("iio: stm32 trigger: Add quadrature encoder device") Signed-off-by: Fabrice Gasnier Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/trigger/stm32-timer-trigger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c index a9bc5b603b86..4cec28af3ecf 100644 --- a/drivers/iio/trigger/stm32-timer-trigger.c +++ b/drivers/iio/trigger/stm32-timer-trigger.c @@ -681,8 +681,9 @@ static ssize_t stm32_count_set_preset(struct iio_dev *indio_dev, if (ret) return ret; + /* TIMx_ARR register shouldn't be buffered (ARPE=0) */ + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, 0); regmap_write(priv->regmap, TIM_ARR, preset); - regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, TIM_CR1_ARPE); return len; } -- cgit From b7a9776c1f9443326632486fcbd82dca82f8511e Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 18 Sep 2017 12:05:31 +0200 Subject: iio: trigger: stm32-timer: fix a corner case to write preset Balance timer start routine that sets ARPE: clear it in stop routine. This fixes a corner case, when timer is used successively as trigger (with sampling_frequency start/stop routines), then as a counter (with preset). Fixes: 93fbe91b5521 ("iio: Add STM32 timer trigger driver") Signed-off-by: Fabrice Gasnier Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/trigger/stm32-timer-trigger.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c index 4cec28af3ecf..a30ba6e1dfec 100644 --- a/drivers/iio/trigger/stm32-timer-trigger.c +++ b/drivers/iio/trigger/stm32-timer-trigger.c @@ -174,6 +174,7 @@ static void stm32_timer_stop(struct stm32_timer_trigger *priv) clk_disable(priv->clk); /* Stop timer */ + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, 0); regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_CEN, 0); regmap_write(priv->regmap, TIM_PSC, 0); regmap_write(priv->regmap, TIM_ARR, 0); -- cgit From 4fb840c95f82652cece7352be9080884cafb92a0 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 18 Sep 2017 18:24:15 +0200 Subject: iio: adc: stm32: fix bad error check on max_channels Fix a bad error check when counting 'st,adc-channels' array elements. This is seen when all channels are in use simultaneously. Fixes: 64ad7f643 ("iio: adc: stm32: introduce compatible data cfg") Signed-off-by: Fabrice Gasnier Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/stm32-adc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/stm32-adc.c b/drivers/iio/adc/stm32-adc.c index 6bc602891f2f..e93244bc3edd 100644 --- a/drivers/iio/adc/stm32-adc.c +++ b/drivers/iio/adc/stm32-adc.c @@ -1656,7 +1656,7 @@ static int stm32_adc_chan_of_init(struct iio_dev *indio_dev) num_channels = of_property_count_u32_elems(node, "st,adc-channels"); if (num_channels < 0 || - num_channels >= adc_info->max_channels) { + num_channels > adc_info->max_channels) { dev_err(&indio_dev->dev, "Bad st,adc-channels?\n"); return num_channels < 0 ? num_channels : -EINVAL; } -- cgit From 0964e40947a630a2a6f724e968246992f97bcf1c Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: iio: adc: mcp320x: Fix oops on module unload The driver calls spi_get_drvdata() in its ->remove hook even though it has never called spi_set_drvdata(). Stack trace for posterity: Unable to handle kernel NULL pointer dereference at virtual address 00000220 Internal error: Oops: 5 [#1] SMP ARM [<8072f564>] (mutex_lock) from [<7f1400d0>] (iio_device_unregister+0x24/0x7c [industrialio]) [<7f1400d0>] (iio_device_unregister [industrialio]) from [<7f15e020>] (mcp320x_remove+0x20/0x30 [mcp320x]) [<7f15e020>] (mcp320x_remove [mcp320x]) from [<8055a8cc>] (spi_drv_remove+0x2c/0x44) [<8055a8cc>] (spi_drv_remove) from [<805087bc>] (__device_release_driver+0x98/0x134) [<805087bc>] (__device_release_driver) from [<80509180>] (driver_detach+0xdc/0xe0) [<80509180>] (driver_detach) from [<8050823c>] (bus_remove_driver+0x5c/0xb0) [<8050823c>] (bus_remove_driver) from [<80509ab0>] (driver_unregister+0x38/0x58) [<80509ab0>] (driver_unregister) from [<7f15e69c>] (mcp320x_driver_exit+0x14/0x1c [mcp320x]) [<7f15e69c>] (mcp320x_driver_exit [mcp320x]) from [<801a78d0>] (SyS_delete_module+0x184/0x1d0) [<801a78d0>] (SyS_delete_module) from [<80108100>] (ret_fast_syscall+0x0/0x1c) Fixes: f5ce4a7a9291 ("iio: adc: add driver for MCP3204/08 12-bit ADC") Cc: Oskar Andero Signed-off-by: Lukas Wunner Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/mcp320x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index 634717ae12f3..45d043c9a888 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -312,6 +312,7 @@ static int mcp320x_probe(struct spi_device *spi) indio_dev->name = spi_get_device_id(spi)->name; indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &mcp320x_info; + spi_set_drvdata(spi, indio_dev); chip_info = &mcp320x_chip_infos[spi_get_device_id(spi)->driver_data]; indio_dev->channels = chip_info->channels; -- cgit From e6f4794371ee7cce1339e7ca9542f1e703c5f84a Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: iio: adc: mcp320x: Fix readout of negative voltages Commit f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") returns a signed voltage from mcp320x_adc_conversion() but neglects that the caller interprets a negative return value as failure. Only mcp3301 (and the upcoming mcp3550/1/3) is affected as the other chips are incapable of measuring negative voltages. Fix and while at it, add mcp3301 to the list of supported chips at the top of the file. Fixes: f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") Cc: Andrea Galbusera Signed-off-by: Lukas Wunner Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/mcp320x.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index 45d043c9a888..071dd23a33d9 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -17,6 +17,8 @@ * MCP3204 * MCP3208 * ------------ + * 13 bit converter + * MCP3301 * * Datasheet can be found here: * http://ww1.microchip.com/downloads/en/DeviceDoc/21293C.pdf mcp3001 @@ -96,7 +98,7 @@ static int mcp320x_channel_to_tx_data(int device_index, } static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, - bool differential, int device_index) + bool differential, int device_index, int *val) { int ret; @@ -117,19 +119,25 @@ static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, switch (device_index) { case mcp3001: - return (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + *val = (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + return 0; case mcp3002: case mcp3004: case mcp3008: - return (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + *val = (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + return 0; case mcp3201: - return (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + *val = (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + return 0; case mcp3202: case mcp3204: case mcp3208: - return (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + *val = (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + return 0; case mcp3301: - return sign_extend32((adc->rx_buf[0] & 0x1f) << 8 | adc->rx_buf[1], 12); + *val = sign_extend32((adc->rx_buf[0] & 0x1f) << 8 + | adc->rx_buf[1], 12); + return 0; default: return -EINVAL; } @@ -150,12 +158,10 @@ static int mcp320x_read_raw(struct iio_dev *indio_dev, switch (mask) { case IIO_CHAN_INFO_RAW: ret = mcp320x_adc_conversion(adc, channel->address, - channel->differential, device_index); - + channel->differential, device_index, val); if (ret < 0) goto out; - *val = ret; ret = IIO_VAL_INT; break; -- cgit From 4b1f0c31f96c45e8521dd84aae50f2aa4aecfb7b Mon Sep 17 00:00:00 2001 From: Colin Parker Date: Mon, 28 Aug 2017 16:21:39 -0700 Subject: IIO: BME280: Updates to Humidity readings need ctrl_reg write! The ctrl_reg register needs to be written after any write to the humidity registers. The value written to the ctrl_reg register does not necessarily need to change, but a write operation must occur. The regmap_update_bits functions will not write to a register if the register value matches the value to be written. This saves unnecessary bus operations. The change in this patch forces a bus write during the chip_config operation by switching to regmap_write_bits. This will fix issues where the Humidity Sensor Oversampling bits are not updated after initialization. Signed-off-by: Colin Parker Acked-by: Andreas Klinger Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/pressure/bmp280-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/pressure/bmp280-core.c b/drivers/iio/pressure/bmp280-core.c index d82b788374b6..e442c5248427 100644 --- a/drivers/iio/pressure/bmp280-core.c +++ b/drivers/iio/pressure/bmp280-core.c @@ -558,7 +558,7 @@ static int bmp280_chip_config(struct bmp280_data *data) u8 osrs = BMP280_OSRS_TEMP_X(data->oversampling_temp + 1) | BMP280_OSRS_PRESS_X(data->oversampling_press + 1); - ret = regmap_update_bits(data->regmap, BMP280_REG_CTRL_MEAS, + ret = regmap_write_bits(data->regmap, BMP280_REG_CTRL_MEAS, BMP280_OSRS_TEMP_MASK | BMP280_OSRS_PRESS_MASK | BMP280_MODE_MASK, -- cgit From 7fc10de8d49a748c476532c9d8e8fe19e548dd67 Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:14:45 +0300 Subject: iio: ad_sigma_delta: Implement a dedicated reset function Since most of the SD ADCs have the option of reseting the serial interface by sending a number of SCLKs with CS = 0 and DIN = 1, a dedicated function that can do this is usefull. Needed for the patch: iio: ad7793: Fix the serial interface reset Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad_sigma_delta.c | 28 ++++++++++++++++++++++++++++ include/linux/iio/adc/ad_sigma_delta.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index d10bd0c97233..22c4c17cd996 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -177,6 +177,34 @@ out: } EXPORT_SYMBOL_GPL(ad_sd_read_reg); +/** + * ad_sd_reset() - Reset the serial interface + * + * @sigma_delta: The sigma delta device + * @reset_length: Number of SCLKs with DIN = 1 + * + * Returns 0 on success, an error code otherwise. + **/ +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length) +{ + uint8_t *buf; + unsigned int size; + int ret; + + size = DIV_ROUND_UP(reset_length, 8); + buf = kcalloc(size, sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memset(buf, 0xff, size); + ret = spi_write(sigma_delta->spi, buf, size); + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(ad_sd_reset); + static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta, unsigned int mode, unsigned int channel) { diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 5ba430cc9a87..1fc7abd28b0b 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -111,6 +111,9 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int *val); +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length); + int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, int *val); int ad_sd_calibrate_all(struct ad_sigma_delta *sigma_delta, -- cgit From 7ee3b7ebcb74714df6d94c8f500f307e1ee5dda5 Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:16:13 +0300 Subject: iio: ad7793: Fix the serial interface reset The serial interface can be reset by writing 32 consecutive 1s to the device. 'ret' was initialized correctly but its value was overwritten when ad7793_check_platform_data() was called. Since a dedicated reset function is present now, it should be used instead. Fixes: 2edb769d246e ("iio:ad7793: Add support for the ad7798 and ad7799") Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7793.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ad7793.c b/drivers/iio/adc/ad7793.c index e6706a09e100..47c3d7f32900 100644 --- a/drivers/iio/adc/ad7793.c +++ b/drivers/iio/adc/ad7793.c @@ -257,7 +257,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, unsigned int vref_mv) { struct ad7793_state *st = iio_priv(indio_dev); - int i, ret = -1; + int i, ret; unsigned long long scale_uv; u32 id; @@ -266,7 +266,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, return ret; /* reset the serial interface */ - ret = spi_write(st->sd.spi, (u8 *)&ret, sizeof(ret)); + ret = ad_sd_reset(&st->sd, 32); if (ret < 0) goto out; usleep_range(500, 2000); /* Wait for at least 500us */ -- cgit From 3d62c78a6eb9a7d67bace9622b66ad51e81c5f9b Mon Sep 17 00:00:00 2001 From: Matt Fornero Date: Tue, 5 Sep 2017 16:34:10 +0200 Subject: iio: core: Return error for failed read_reg If an IIO device returns an error code for a read access via debugfs, it is currently ignored by the IIO core (other than emitting an error message). Instead, return this error code to user space, so upper layers can detect it correctly. Signed-off-by: Matt Fornero Signed-off-by: Lars-Peter Clausen Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 17ec4cee51dc..a47428b4d31b 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -310,8 +310,10 @@ static ssize_t iio_debugfs_read_reg(struct file *file, char __user *userbuf, ret = indio_dev->info->debugfs_reg_access(indio_dev, indio_dev->cached_reg_addr, 0, &val); - if (ret) + if (ret) { dev_err(indio_dev->dev.parent, "%s: read failed\n", __func__); + return ret; + } len = snprintf(buf, sizeof(buf), "0x%X\n", val); -- cgit From f790923f146140a261ad211e5baf75d169f16fb2 Mon Sep 17 00:00:00 2001 From: Stefan Popa Date: Thu, 14 Sep 2017 16:50:28 +0300 Subject: staging: iio: ad7192: Fix - use the dedicated reset function avoiding dma from stack. Depends on: 691c4b95d1 ("iio: ad_sigma_delta: Implement a dedicated reset function") SPI host drivers can use DMA to transfer data, so the buffer should be properly allocated. Keeping it on the stack could cause an undefined behavior. The dedicated reset function solves this issue. Signed-off-by: Stefan Popa Acked-by: Lars-Peter Clausen Acked-by: Michael Hennerich Cc: Signed-off-by: Jonathan Cameron --- drivers/staging/iio/adc/ad7192.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/staging/iio/adc/ad7192.c b/drivers/staging/iio/adc/ad7192.c index d11c6de9c777..6150d2780e22 100644 --- a/drivers/staging/iio/adc/ad7192.c +++ b/drivers/staging/iio/adc/ad7192.c @@ -223,11 +223,9 @@ static int ad7192_setup(struct ad7192_state *st, struct iio_dev *indio_dev = spi_get_drvdata(st->sd.spi); unsigned long long scale_uv; int i, ret, id; - u8 ones[6]; /* reset the serial interface */ - memset(&ones, 0xFF, 6); - ret = spi_write(st->sd.spi, &ones, 6); + ret = ad_sd_reset(&st->sd, 48); if (ret < 0) goto out; usleep_range(500, 1000); /* Wait for at least 500us */ -- cgit From 5f3d862a736398e7068fa67142133f1713fdee8c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 18 Sep 2017 09:41:45 +0200 Subject: qxl: fix framebuffer unpinning qxl_plane_cleanup_fb() unpins the just activated framebuffer instead of the old one. Oops. Fix it. Cc: Gabriel Krisman Bertazi Fixes: 1277eed5fecb8830c8cc414ad70c1ef640464bc0 Signed-off-by: Gerd Hoffmann Reviewed-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170918074145.2257-1-kraxel@redhat.com --- drivers/gpu/drm/qxl/qxl_display.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index e1dd05423e86..afbf50d0c08f 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -702,14 +702,15 @@ static void qxl_plane_cleanup_fb(struct drm_plane *plane, struct drm_gem_object *obj; struct qxl_bo *user_bo; - if (!plane->state->fb) { - /* we never executed prepare_fb, so there's nothing to + if (!old_state->fb) { + /* + * we never executed prepare_fb, so there's nothing to * unpin. */ return; } - obj = to_qxl_framebuffer(plane->state->fb)->obj; + obj = to_qxl_framebuffer(old_state->fb)->obj; user_bo = gem_to_qxl_bo(obj); qxl_bo_unpin(user_bo); } -- cgit From 814fb7bb7db5433757d76f4c4502c96fc53b0b5e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Sep 2017 15:00:07 +0200 Subject: x86/fpu: Don't let userspace set bogus xcomp_bv On x86, userspace can use the ptrace() or rt_sigreturn() system calls to set a task's extended state (xstate) or "FPU" registers. ptrace() can set them for another task using the PTRACE_SETREGSET request with NT_X86_XSTATE, while rt_sigreturn() can set them for the current task. In either case, registers can be set to any value, but the kernel assumes that the XSAVE area itself remains valid in the sense that the CPU can restore it. However, in the case where the kernel is using the uncompacted xstate format (which it does whenever the XSAVES instruction is unavailable), it was possible for userspace to set the xcomp_bv field in the xstate_header to an arbitrary value. However, all bits in that field are reserved in the uncompacted case, so when switching to a task with nonzero xcomp_bv, the XRSTOR instruction failed with a #GP fault. This caused the WARN_ON_FPU(err) in copy_kernel_to_xregs() to be hit. In addition, since the error is otherwise ignored, the FPU registers from the task previously executing on the CPU were leaked. Fix the bug by checking that the user-supplied value of xcomp_bv is 0 in the uncompacted case, and returning an error otherwise. The reason for validating xcomp_bv rather than simply overwriting it with 0 is that we want userspace to see an error if it (incorrectly) provides an XSAVE area in compacted format rather than in uncompacted format. Note that as before, in case of error we clear the task's FPU state. This is perhaps non-ideal, especially for PTRACE_SETREGSET; it might be better to return an error before changing anything. But it seems the "clear on error" behavior is fine for now, and it's a little tricky to do otherwise because it would mean we couldn't simply copy the full userspace state into kernel memory in one __copy_from_user(). This bug was found by syzkaller, which hit the above-mentioned WARN_ON_FPU(): WARNING: CPU: 1 PID: 0 at ./arch/x86/include/asm/fpu/internal.h:373 __switch_to+0x5b5/0x5d0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.13.0 #453 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff9ba2bc8e42c0 task.stack: ffffa78cc036c000 RIP: 0010:__switch_to+0x5b5/0x5d0 RSP: 0000:ffffa78cc08bbb88 EFLAGS: 00010082 RAX: 00000000fffffffe RBX: ffff9ba2b8bf2180 RCX: 00000000c0000100 RDX: 00000000ffffffff RSI: 000000005cb10700 RDI: ffff9ba2b8bf36c0 RBP: ffffa78cc08bbbd0 R08: 00000000929fdf46 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9ba2bc8e42c0 R13: 0000000000000000 R14: ffff9ba2b8bf3680 R15: ffff9ba2bf5d7b40 FS: 00007f7e5cb10700(0000) GS:ffff9ba2bf400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004005cc CR3: 0000000079fd5000 CR4: 00000000001406e0 Call Trace: Code: 84 00 00 00 00 00 e9 11 fd ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 e7 fa ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 c2 fa ff ff <0f> ff 66 0f 1f 84 00 00 00 00 00 e9 d4 fc ff ff 66 66 2e 0f 1f Here is a C reproducer. The expected behavior is that the program spin forever with no output. However, on a buggy kernel running on a processor with the "xsave" feature but without the "xsaves" feature (e.g. Sandy Bridge through Broadwell for Intel), within a second or two the program reports that the xmm registers were corrupted, i.e. were not restored correctly. With CONFIG_X86_DEBUG_FPU=y it also hits the above kernel warning. #define _GNU_SOURCE #include #include #include #include #include #include #include #include int main(void) { int pid = fork(); uint64_t xstate[512]; struct iovec iov = { .iov_base = xstate, .iov_len = sizeof(xstate) }; if (pid == 0) { bool tracee = true; for (int i = 0; i < sysconf(_SC_NPROCESSORS_ONLN) && tracee; i++) tracee = (fork() != 0); uint32_t xmm0[4] = { [0 ... 3] = tracee ? 0x00000000 : 0xDEADBEEF }; asm volatile(" movdqu %0, %%xmm0\n" " mov %0, %%rbx\n" "1: movdqu %%xmm0, %0\n" " mov %0, %%rax\n" " cmp %%rax, %%rbx\n" " je 1b\n" : "+m" (xmm0) : : "rax", "rbx", "xmm0"); printf("BUG: xmm registers corrupted! tracee=%d, xmm0=%08X%08X%08X%08X\n", tracee, xmm0[0], xmm0[1], xmm0[2], xmm0[3]); } else { usleep(100000); ptrace(PTRACE_ATTACH, pid, 0, 0); wait(NULL); ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); xstate[65] = -1; ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov); ptrace(PTRACE_CONT, pid, 0, 0); wait(NULL); } return 1; } Note: the program only tests for the bug using the ptrace() system call. The bug can also be reproduced using the rt_sigreturn() system call, but only when called from a 32-bit program, since for 64-bit programs the kernel restores the FPU state from the signal frame by doing XRSTOR directly from userspace memory (with proper error checking). Reported-by: Dmitry Vyukov Signed-off-by: Eric Biggers Reviewed-by: Kees Cook Reviewed-by: Rik van Riel Acked-by: Dave Hansen Cc: [v3.17+] Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Fixes: 0b29643a5843 ("x86/xsaves: Change compacted format xsave area header") Link: http://lkml.kernel.org/r/20170922174156.16780-2-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-25-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 4 ++++ arch/x86/kernel/fpu/signal.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 19a7385a912c..c764f7405322 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -141,6 +141,10 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 629106e51a29..da68ea1c3a44 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,11 +324,16 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (using_compacted_format()) + if (using_compacted_format()) { err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); - else + } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + /* xcomp_bv must be 0 when using uncompacted format */ + if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) + err = -EINVAL; + } + if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); -- cgit From d5c8028b4788f62b31fb79a331b3ad3e041fa366 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Sep 2017 15:00:09 +0200 Subject: x86/fpu: Reinitialize FPU registers if restoring FPU state fails Userspace can change the FPU state of a task using the ptrace() or rt_sigreturn() system calls. Because reserved bits in the FPU state can cause the XRSTOR instruction to fail, the kernel has to carefully validate that no reserved bits or other invalid values are being set. Unfortunately, there have been bugs in this validation code. For example, we were not checking that the 'xcomp_bv' field in the xstate_header was 0. As-is, such bugs are exploitable to read the FPU registers of other processes on the system. To do so, an attacker can create a task, assign to it an invalid FPU state, then spin in a loop and monitor the values of the FPU registers. Because the task's FPU registers are not being restored, sometimes the FPU registers will have the values from another process. This is likely to continue to be a problem in the future because the validation done by the CPU instructions like XRSTOR is not immediately visible to kernel developers. Nor will invalid FPU states ever be encountered during ordinary use --- they will only be seen during fuzzing or exploits. There can even be reserved bits outside the xstate_header which are easy to forget about. For example, the MXCSR register contains reserved bits, which were not validated by the KVM_SET_XSAVE ioctl until commit a575813bfe4b ("KVM: x86: Fix load damaged SSEx MXCSR register"). Therefore, mitigate this class of vulnerability by restoring the FPU registers from init_fpstate if restoring from the task's state fails. We actually used to do this, but it was (perhaps unwisely) removed by commit 9ccc27a5d297 ("x86/fpu: Remove error return values from copy_kernel_to_*regs() functions"). This new patch is also a bit different. First, it only clears the registers, not also the bad in-memory state; this is simpler and makes it easier to make the mitigation cover all callers of __copy_kernel_to_fpregs(). Second, it does the register clearing in an exception handler so that no extra instructions are added to context switches. In fact, we *remove* instructions, since previously we were always zeroing the register containing 'err' even if CONFIG_X86_DEBUG_FPU was disabled. Signed-off-by: Eric Biggers Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170922174156.16780-4-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-27-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 51 +++++++++++-------------------------- arch/x86/mm/extable.c | 24 +++++++++++++++++ 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2dca7c65319c..cf290d424e48 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -120,20 +120,11 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) -#define check_insn(insn, output, input...) \ -({ \ - int err; \ +#define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ + : output : input) static inline int copy_fregs_to_user(struct fregs_state __user *fx) { @@ -153,20 +144,16 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { - int err; - if (IS_ENABLED(CONFIG_X86_32)) { - err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { - err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { /* See comment in copy_fxregs_to_kernel() below. */ - err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); + kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } } - /* Copying from a kernel buffer to FPU registers should never fail: */ - WARN_ON_FPU(err); } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) @@ -183,9 +170,7 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) static inline void copy_kernel_to_fregs(struct fregs_state *fx) { - int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - - WARN_ON_FPU(err); + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fregs(struct fregs_state __user *fx) @@ -281,18 +266,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ -#define XSTATE_XRESTORE(st, lmask, hmask, err) \ +#define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile(ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ - "xor %[err], %[err]\n" \ "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ + _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ + : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -336,7 +316,10 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - /* We should never fault when copying from a kernel buffer: */ + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ WARN_ON_FPU(err); } @@ -365,12 +348,8 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; - int err; - - XSTATE_XRESTORE(xstate, lmask, hmask, err); - /* We should never fault when copying from a kernel buffer: */ - WARN_ON_FPU(err); + XSTATE_XRESTORE(xstate, lmask, hmask); } /* diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index c076f710de4c..c3521e2be396 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -78,6 +79,29 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_refcount); +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + __copy_kernel_to_fpregs(&init_fpstate, -1); + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fprestore); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { -- cgit From 10430364ebb562311ba6a6efa74e0c2298007912 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Tue, 29 Aug 2017 23:47:11 +0530 Subject: x86/numachip: Add const and __initconst to numachip2_clockevent Make this const as it is only used during a copy operation and add __initconst as this usage is during the initialization phase. Signed-off-by: Bhumika Goyal Signed-off-by: Thomas Gleixner Cc: julia.lawall@lip6.fr Cc: daniel.lezcano@linaro.org Link: http://lkml.kernel.org/r/1504030631-10812-1-git-send-email-bhumirks@gmail.com --- drivers/clocksource/numachip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/numachip.c b/drivers/clocksource/numachip.c index 6a20dc8b253f..9a7d7f0f23fe 100644 --- a/drivers/clocksource/numachip.c +++ b/drivers/clocksource/numachip.c @@ -43,7 +43,7 @@ static int numachip2_set_next_event(unsigned long delta, struct clock_event_devi return 0; } -static struct clock_event_device numachip2_clockevent = { +static const struct clock_event_device numachip2_clockevent __initconst = { .name = "numachip2", .rating = 400, .set_next_event = numachip2_set_next_event, -- cgit From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 4 Sep 2017 10:32:15 +0200 Subject: x86/mm: Fix fault error path using unsafe vma pointer commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") passes down a vma pointer to the error path, but that is done once the mmap_sem is released when calling mm_fault_error() from __do_page_fault(). This is dangerous as the vma structure is no more safe to be used once the mmap_sem has been released. As only the protection key value is required in the error processing, we could just pass down this value. Fix it by passing a pointer to a protection key value down to the fault signal generation code. The use of a pointer allows to keep the check generating a warning message in fill_sig_info_pkey() when the vma was not known. If the pointer is valid, the protection value can be accessed by deferencing the pointer. [ tglx: Made *pkey u32 as that's the type which is passed in siginfo ] Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") Signed-off-by: Laurent Dufour Signed-off-by: Thomas Gleixner Cc: linux-mm@kvack.org Cc: Dave Hansen Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com --- arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 39567b5c33da..e2baeaa053a5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. */ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -762,8 +760,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -788,7 +784,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -896,8 +892,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -945,7 +940,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -958,9 +953,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -968,6 +963,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -975,7 +974,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); } static noinline void @@ -1018,7 +1018,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1045,13 +1045,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1075,9 +1074,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1267,6 +1266,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1467,9 +1467,10 @@ good_area: return; } + pkey = vma_pkey(vma); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } -- cgit From 7d7099433d9eaaa5a989a55f1fa354c16a3ad297 Mon Sep 17 00:00:00 2001 From: Sean Fu Date: Mon, 11 Sep 2017 08:33:21 +0800 Subject: x86/sysfs: Fix off-by-one error in loop termination An off-by-one error in loop terminantion conditions in create_setup_data_nodes() will lead to memory leak when create_setup_data_node() failed. Signed-off-by: Sean Fu Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1505090001-1157-1-git-send-email-fxinrong@gmail.com --- arch/x86/kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4b0592ca9e47..8c1cc08f514f 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -299,7 +299,7 @@ static int __init create_setup_data_nodes(struct kobject *parent) return 0; out_clean_nodes: - for (j = i - 1; j > 0; j--) + for (j = i - 1; j >= 0; j--) cleanup_setup_data_node(*(kobjp + j)); kfree(kobjp); out_setup_data_kobj: -- cgit From 5ac751d9e6b187c4a0000879d6598eb2292db949 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Tue, 12 Sep 2017 19:40:00 +0300 Subject: x86: Don't cast away the __user in __get_user_asm_u64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't cast away the __user in __get_user_asm_u64() on x86-32. Prevents sparse getting upset. Signed-off-by: Ville Syrjälä Signed-off-by: Thomas Gleixner Cc: Benjamin LaHaise Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170912164000.13745-1-ville.syrjala@linux.intel.com --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 78e8fcc87d4c..4b892917edeb 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -337,7 +337,7 @@ do { \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ : "=r" (retval), "=&A"(x) \ - : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ + : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) -- cgit From b09c146f8f63c0e03adba74df76bf9c2be466fec Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:47 -0400 Subject: perf/x86/intel/cstate: Add missing CPU IDs Skylake server uses the same C-state residency events as Sandy Bridge. Denverton and Gemini lake use the same C-state residency events as Apollo Lake. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-1-kan.liang@intel.com --- arch/x86/events/intel/cstate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4cf100ff2a37..72db0664a53d 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -552,6 +552,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), @@ -560,6 +561,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -- cgit From 1aaccc40a1864053da26605b0297be16dd52641e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:48 -0400 Subject: perf/x86/msr: Add missing CPU IDs Goldmont, Glodmont plus and Xeon Phi have MSR_SMI_COUNT as well. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-2-kan.liang@intel.com --- arch/x86/events/msr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 4bb3ec69e8ea..06723671ae4e 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -63,6 +63,14 @@ static bool test_intel(int idx) case INTEL_FAM6_ATOM_SILVERMONT1: case INTEL_FAM6_ATOM_SILVERMONT2: case INTEL_FAM6_ATOM_AIRMONT: + + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_DENVERTON: + + case INTEL_FAM6_ATOM_GEMINI_LAKE: + + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: if (idx == PERF_MSR_SMI) return true; break; -- cgit From 450a97893559354b927c935f39ee11126f01f520 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:49 -0400 Subject: perf/x86/intel/rapl: Add missing CPU IDs DENVERTON and GEMINI_LAKE support same RAPL counters as Apollo Lake. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-3-kan.liang@intel.com --- arch/x86/events/intel/rapl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 8e2457cb6b4a..005908ee9333 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -775,6 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), {}, }; -- cgit From 57999d1107c1e60c2ca7088f2ac0f819e2f554b3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Sep 2017 23:43:25 +0300 Subject: USB: devio: Prevent integer overflow in proc_do_submiturb() There used to be an integer overflow check in proc_do_submiturb() but we removed it. It turns out that it's still required. The uurb->buffer_length variable is a signed integer and it's controlled by the user. It can lead to an integer overflow when we do: num_sgs = DIV_ROUND_UP(uurb->buffer_length, USB_SG_SIZE); If we strip away the macro then that line looks like this: num_sgs = (uurb->buffer_length + USB_SG_SIZE - 1) / USB_SG_SIZE; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It's the first addition which can overflow. Fixes: 1129d270cbfb ("USB: Increase usbfs transfer limit") Cc: stable Signed-off-by: Dan Carpenter Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 318bb3b96687..e9326f31db8d 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -140,6 +140,9 @@ module_param(usbfs_memory_mb, uint, 0644); MODULE_PARM_DESC(usbfs_memory_mb, "maximum MB allowed for usbfs buffers (0 = no limit)"); +/* Hard limit, necessary to avoid arithmetic overflow */ +#define USBFS_XFER_MAX (UINT_MAX / 2 - 1000000) + static atomic64_t usbfs_memory_usage; /* Total memory currently allocated */ /* Check whether it's okay to allocate more memory for a transfer */ @@ -1460,6 +1463,8 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb USBDEVFS_URB_ZERO_PACKET | USBDEVFS_URB_NO_INTERRUPT)) return -EINVAL; + if ((unsigned int)uurb->buffer_length >= USBFS_XFER_MAX) + return -EINVAL; if (uurb->buffer_length > 0 && !uurb->buffer) return -EINVAL; if (!(uurb->type == USBDEVFS_URB_TYPE_CONTROL && -- cgit From fa1ed74eb1c233be6131ec92df21ab46499a15b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Sep 2017 23:43:46 +0300 Subject: USB: devio: Don't corrupt user memory The user buffer has "uurb->buffer_length" bytes. If the kernel has more information than that, we should truncate it instead of writing past the end of the user's buffer. I added a WARN_ONCE() to help the user debug the issue. Reported-by: Alan Stern Cc: stable Signed-off-by: Dan Carpenter Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index e9326f31db8d..4664e543cf2f 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1576,7 +1576,11 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb totlen += isopkt[u].length; } u *= sizeof(struct usb_iso_packet_descriptor); - uurb->buffer_length = totlen; + if (totlen <= uurb->buffer_length) + uurb->buffer_length = totlen; + else + WARN_ONCE(1, "uurb->buffer_length is too short %d vs %d", + totlen, uurb->buffer_length); break; default: -- cgit From 8fec9355a968ad240f3a2e9ad55b823cf1cc52ff Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 22 Sep 2017 22:18:18 +0200 Subject: USB: cdc-wdm: ignore -EPIPE from GetEncapsulatedResponse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver will forward errors to userspace after turning most of them into -EIO. But all status codes are not equal. The -EPIPE (stall) in particular can be seen more as a result of normal USB signaling than an actual error. The state is automatically cleared by the USB core without intervention from either driver or userspace. And most devices and firmwares will never trigger a stall as a result of GetEncapsulatedResponse. This is in fact a requirement for CDC WDM devices. Quoting from section 7.1 of the CDC WMC spec revision 1.1: The function shall not return STALL in response to GetEncapsulatedResponse. But this driver is also handling GetEncapsulatedResponse on behalf of the qmi_wwan and cdc_mbim drivers. Unfortunately the relevant specs are not as clear wrt stall. So some QMI and MBIM devices *will* occasionally stall, causing the GetEncapsulatedResponse to return an -EPIPE status. Translating this into -EIO for userspace has proven to be harmful. Treating it as an empty read is safer, making the driver behave as if the device was conforming to the CDC WDM spec. There have been numerous reports of issues related to -EPIPE errors from some newer CDC MBIM devices in particular, like for example the Fibocom L831-EAU. Testing on this device has shown that the issues go away if we simply ignore the -EPIPE status. Similar handling of -EPIPE is already known from e.g. usb_get_string() The -EPIPE log message is still kept to let us track devices with this unexpected behaviour, hoping that it attracts attention from firmware developers. Cc: Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100938 Reported-and-tested-by: Christian Ehrig Reported-and-tested-by: Patrick Chilton Reported-and-tested-by: Andreas Böhler Signed-off-by: Bjørn Mork Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 5aacea1978a5..3e865dbf878c 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -190,8 +190,10 @@ static void wdm_in_callback(struct urb *urb) /* * only set a new error if there is no previous error. * Errors are only cleared during read/open + * Avoid propagating -EPIPE (stall) to userspace since it is + * better handled as an empty read */ - if (desc->rerr == 0) + if (desc->rerr == 0 && status != -EPIPE) desc->rerr = status; if (length + desc->length > desc->wMaxCommand) { -- cgit From 29b46dfb136cdbeece542b3f01115237e43f2855 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 11 Sep 2017 10:10:15 -0700 Subject: perf/x86/intel/uncore: Correct num_boxes for IIO and IRP There are 6 IIO/IRP boxes for CBDMA, PCIe0-2, MCP 0 and MCP 1 separately. Correct the num_boxes. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@kernel.org Link: http://lkml.kernel.org/r/1505149816-12580-1-git-send-email-kan.liang@intel.com --- arch/x86/events/intel/uncore_snbep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index db1fe377e6dd..a7196818416a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3462,7 +3462,7 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IIO0_MSR_PMON_CTL0, .perf_ctr = SKX_IIO0_MSR_PMON_CTR0, @@ -3492,7 +3492,7 @@ static const struct attribute_group skx_uncore_format_group = { static struct intel_uncore_type skx_uncore_irp = { .name = "irp", .num_counters = 2, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IRP0_MSR_PMON_CTL0, .perf_ctr = SKX_IRP0_MSR_PMON_CTR0, -- cgit From 0add53713b1c07a1c71e27a20e21eb7c180b4e7b Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Tue, 19 Sep 2017 16:40:21 +0200 Subject: microblaze: Add missing kvm_para.h to Kbuild Running make allmodconfig;make is throwing compilation error: CC kernel/watchdog.o In file included from ./include/linux/kvm_para.h:4:0, from kernel/watchdog.c:29: ./include/uapi/linux/kvm_para.h:32:26: fatal error: asm/kvm_para.h: No such file or directory #include ^ compilation terminated. make[1]: *** [kernel/watchdog.o] Error 1 make: *** [kernel/watchdog.o] Error 2 Reported-by: Michal Hocko Suggested-by: Geert Uytterhoeven Signed-off-by: Michal Simek Fixes: 83f0124ad81e87b ("microblaze: remove asm-generic wrapper headers") Reviewed-by: Tobias Klauser Tested-by: Michal Hocko --- arch/microblaze/include/uapi/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild index e77a596f3f1e..06609ca36115 100644 --- a/arch/microblaze/include/uapi/asm/Kbuild +++ b/arch/microblaze/include/uapi/asm/Kbuild @@ -7,6 +7,7 @@ generic-y += fcntl.h generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h +generic-y += kvm_para.h generic-y += mman.h generic-y += msgbuf.h generic-y += param.h -- cgit From 64c99853baca40e2f06038c4a926009edd14c7c3 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 00:29:36 +0200 Subject: microblaze: Cocci spatch "vma_pages" Use vma_pages function on vma object instead of explicit computation. Found by coccinelle spatch "api/vma_pages.cocci" Signed-off-by: Thomas Meyer Signed-off-by: Michal Simek --- arch/microblaze/kernel/dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index e45ada8fb006..94700c5270a9 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -165,7 +165,7 @@ int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma, unsigned long attrs) { #ifdef CONFIG_MMU - unsigned long user_count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; unsigned long pfn; -- cgit From 428dbf156cc5a8f9994d1f1a5c79373d15476f3c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 18 Sep 2017 10:53:29 -0600 Subject: arch: change default endian for microblaze Fix the default for microblaze. Michal Simek mentioned default for microblaze should be CPU_LITTLE_ENDIAN. Fixes : commit 206d3642d8ee ("arch/microblaze: add choice for endianness and update Makefile") Signed-off-by: Babu Moger Cc: Michal Simek Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 9d26abdf0dc1..4f798aa671dd 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -39,7 +39,7 @@ config MICROBLAZE # Endianness selection choice prompt "Endianness selection" - default CPU_BIG_ENDIAN + default CPU_LITTLE_ENDIAN help microblaze architectures can be configured for either little or big endian formats. Be sure to select the appropriate mode. -- cgit From 89975bd335f37b96ffd3cc24b9effb1fa25e7788 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Sep 2017 16:41:34 -0300 Subject: perf tools: Get all of tools/{arch,include}/ in the MANIFEST Now that I'm switching the container builds from using a local volume pointing to the kernel repository with the perf sources, instead getting a detached tarball to be able to use a container cluster, some places broke because I forgot to put some of the required files in tools/perf/MANIFEST, namely some bitsperlong.h files. So, to fix it do the same as for tools/build/ and pack the whole tools/arch/ directory. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wmenpjfjsobwdnfde30qqncj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/MANIFEST | 87 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 85 deletions(-) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 62072822dc85..627b7cada144 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,34 +1,8 @@ tools/perf -tools/arch/alpha/include/asm/barrier.h -tools/arch/arm/include/asm/barrier.h -tools/arch/arm64/include/asm/barrier.h -tools/arch/ia64/include/asm/barrier.h -tools/arch/mips/include/asm/barrier.h -tools/arch/powerpc/include/asm/barrier.h -tools/arch/s390/include/asm/barrier.h -tools/arch/sh/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier_32.h -tools/arch/sparc/include/asm/barrier_64.h -tools/arch/tile/include/asm/barrier.h -tools/arch/x86/include/asm/barrier.h -tools/arch/x86/include/asm/cmpxchg.h -tools/arch/x86/include/asm/cpufeatures.h -tools/arch/x86/include/asm/disabled-features.h -tools/arch/x86/include/asm/required-features.h -tools/arch/x86/include/uapi/asm/svm.h -tools/arch/x86/include/uapi/asm/vmx.h -tools/arch/x86/include/uapi/asm/kvm.h -tools/arch/x86/include/uapi/asm/kvm_perf.h -tools/arch/x86/lib/memcpy_64.S -tools/arch/x86/lib/memset_64.S -tools/arch/s390/include/uapi/asm/kvm_perf.h -tools/arch/s390/include/uapi/asm/sie.h -tools/arch/xtensa/include/asm/barrier.h +tools/arch tools/scripts tools/build -tools/arch/x86/include/asm/atomic.h -tools/arch/x86/include/asm/rmwcc.h +tools/include tools/lib/traceevent tools/lib/api tools/lib/bpf @@ -42,60 +16,3 @@ tools/lib/find_bit.c tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c -tools/include/asm/alternative-asm.h -tools/include/asm/atomic.h -tools/include/asm/barrier.h -tools/include/asm/bug.h -tools/include/asm-generic/atomic-gcc.h -tools/include/asm-generic/barrier.h -tools/include/asm-generic/bitops/arch_hweight.h -tools/include/asm-generic/bitops/atomic.h -tools/include/asm-generic/bitops/const_hweight.h -tools/include/asm-generic/bitops/__ffs.h -tools/include/asm-generic/bitops/__ffz.h -tools/include/asm-generic/bitops/__fls.h -tools/include/asm-generic/bitops/find.h -tools/include/asm-generic/bitops/fls64.h -tools/include/asm-generic/bitops/fls.h -tools/include/asm-generic/bitops/hweight.h -tools/include/asm-generic/bitops.h -tools/include/linux/atomic.h -tools/include/linux/bitops.h -tools/include/linux/compiler.h -tools/include/linux/compiler-gcc.h -tools/include/linux/coresight-pmu.h -tools/include/linux/bug.h -tools/include/linux/filter.h -tools/include/linux/hash.h -tools/include/linux/kernel.h -tools/include/linux/list.h -tools/include/linux/log2.h -tools/include/uapi/asm-generic/fcntl.h -tools/include/uapi/asm-generic/ioctls.h -tools/include/uapi/asm-generic/mman-common.h -tools/include/uapi/asm-generic/mman.h -tools/include/uapi/drm/drm.h -tools/include/uapi/drm/i915_drm.h -tools/include/uapi/linux/bpf.h -tools/include/uapi/linux/bpf_common.h -tools/include/uapi/linux/fcntl.h -tools/include/uapi/linux/hw_breakpoint.h -tools/include/uapi/linux/kvm.h -tools/include/uapi/linux/mman.h -tools/include/uapi/linux/perf_event.h -tools/include/uapi/linux/sched.h -tools/include/uapi/linux/stat.h -tools/include/uapi/linux/vhost.h -tools/include/uapi/sound/asound.h -tools/include/linux/poison.h -tools/include/linux/rbtree.h -tools/include/linux/rbtree_augmented.h -tools/include/linux/refcount.h -tools/include/linux/string.h -tools/include/linux/stringify.h -tools/include/linux/types.h -tools/include/linux/err.h -tools/include/linux/bitmap.h -tools/include/linux/time64.h -tools/arch/*/include/uapi/asm/mman.h -tools/arch/*/include/uapi/asm/perf_regs.h -- cgit From 549a3976523c69a0245c0a310210c824a0b26e35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Sep 2017 09:38:23 +0200 Subject: tools include: Sync kernel ABI headers with tooling headers Time for a sync with ABI/uapi headers with the upcoming v4.14 kernel. None of the ABI changes require any source code level changes to our existing in-kernel tooling code: - tools/arch/s390/include/uapi/asm/kvm.h: New KVM_S390_VM_TOD_EXT ABI, not used by in-kernel tooling. - tools/arch/x86/include/asm/cpufeatures.h: tools/arch/x86/include/asm/disabled-features.h: New PCID, SME and VGIF x86 CPU feature bits defined. - tools/include/asm-generic/hugetlb_encode.h: tools/include/uapi/asm-generic/mman-common.h: tools/include/uapi/linux/mman.h: Two new madvise() flags, plus a hugetlb system call mmap flags restructuring/extension changes. - tools/include/uapi/drm/drm.h: tools/include/uapi/drm/i915_drm.h: New drm_syncobj_create flags definitions, new drm_syncobj_wait and drm_syncobj_array ABIs. DRM_I915_PERF_* calls and a new I915_PARAM_HAS_EXEC_FENCE_ARRAY ABI for the Intel driver. - tools/include/uapi/linux/bpf.h: New bpf_sock fields (::mark and ::priority), new XDP_REDIRECT action, new kvm_ppc_smmu_info fields (::data_keys, instr_keys) Signed-off-by: Ingo Molnar Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Taeung Song Cc: Wang Nan Cc: Yao Jin Link: http://lkml.kernel.org/r/20170913073823.lxmi4c7ejqlfabjx@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/s390/include/uapi/asm/kvm.h | 6 +++ tools/arch/x86/include/asm/cpufeatures.h | 2 + tools/arch/x86/include/asm/disabled-features.h | 4 +- tools/include/asm-generic/hugetlb_encode.h | 34 +++++++++++++++++ tools/include/uapi/asm-generic/mman-common.h | 14 ++----- tools/include/uapi/drm/drm.h | 22 +++++++++++ tools/include/uapi/drm/i915_drm.h | 51 +++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 32 ++++++++++------ tools/include/uapi/linux/kvm.h | 3 +- tools/include/uapi/linux/mman.h | 24 +++++++++++- 10 files changed, 164 insertions(+), 28 deletions(-) create mode 100644 tools/include/asm-generic/hugetlb_encode.h diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 69d09c39bbcd..cd7359e23d86 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -88,6 +88,12 @@ struct kvm_s390_io_adapter_req { /* kvm attributes for KVM_S390_VM_TOD */ #define KVM_S390_VM_TOD_LOW 0 #define KVM_S390_VM_TOD_HIGH 1 +#define KVM_S390_VM_TOD_EXT 2 + +struct kvm_s390_vm_tod_clock { + __u8 epoch_idx; + __u64 tod; +}; /* kvm attributes for KVM_S390_VM_CPU_MODEL */ /* processor related attributes are r/w */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 8ea315a11fe0..2519c6c801c9 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ @@ -287,6 +288,7 @@ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/tools/include/asm-generic/hugetlb_encode.h b/tools/include/asm-generic/hugetlb_encode.h new file mode 100644 index 000000000000..e4732d3c2998 --- /dev/null +++ b/tools/include/asm-generic/hugetlb_encode.h @@ -0,0 +1,34 @@ +#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_ +#define _ASM_GENERIC_HUGETLB_ENCODE_H_ + +/* + * Several system calls take a flag to request "hugetlb" huge pages. + * Without further specification, these system calls will use the + * system's default huge page size. If a system supports multiple + * huge page sizes, the desired huge page size can be specified in + * bits [26:31] of the flag arguments. The value in these 6 bits + * will encode the log2 of the huge page size. + * + * The following definitions are associated with this huge page size + * encoding in flag arguments. System call specific header files + * that use this encoding should include this file. They can then + * provide definitions based on these with their own specific prefix. + * for example: + * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT + */ + +#define HUGETLB_FLAG_ENCODE_SHIFT 26 +#define HUGETLB_FLAG_ENCODE_MASK 0x3f + +#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) + +#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 8c27db0c5c08..203268f9231e 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -58,20 +58,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 101593ab10ac..97677cd6964d 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -700,6 +700,7 @@ struct drm_prime_handle { struct drm_syncobj_create { __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) __u32 flags; }; @@ -718,6 +719,24 @@ struct drm_syncobj_handle { __u32 pad; }; +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; +}; + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + #if defined(__cplusplus) } #endif @@ -840,6 +859,9 @@ extern "C" { #define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) #define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) #define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) /** * Device specific ioctls should only be in their respective headers diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 7ccbd6a2bbe0..6598fb76d2c2 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -260,6 +260,8 @@ typedef struct _drm_i915_sarea { #define DRM_I915_GEM_CONTEXT_GETPARAM 0x34 #define DRM_I915_GEM_CONTEXT_SETPARAM 0x35 #define DRM_I915_PERF_OPEN 0x36 +#define DRM_I915_PERF_ADD_CONFIG 0x37 +#define DRM_I915_PERF_REMOVE_CONFIG 0x38 #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t) #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH) @@ -315,6 +317,8 @@ typedef struct _drm_i915_sarea { #define DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_GETPARAM, struct drm_i915_gem_context_param) #define DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_SETPARAM, struct drm_i915_gem_context_param) #define DRM_IOCTL_I915_PERF_OPEN DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_OPEN, struct drm_i915_perf_open_param) +#define DRM_IOCTL_I915_PERF_ADD_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config) +#define DRM_IOCTL_I915_PERF_REMOVE_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64) /* Allow drivers to submit batchbuffers directly to hardware, relying * on the security mechanisms provided by hardware. @@ -431,6 +435,11 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_HAS_EXEC_BATCH_FIRST 48 +/* Query whether DRM_I915_GEM_EXECBUFFER2 supports supplying an array of + * drm_i915_gem_exec_fence structures. See I915_EXEC_FENCE_ARRAY. + */ +#define I915_PARAM_HAS_EXEC_FENCE_ARRAY 49 + typedef struct drm_i915_getparam { __s32 param; /* @@ -812,6 +821,17 @@ struct drm_i915_gem_exec_object2 { __u64 rsvd2; }; +struct drm_i915_gem_exec_fence { + /** + * User's handle for a drm_syncobj to wait on or signal. + */ + __u32 handle; + +#define I915_EXEC_FENCE_WAIT (1<<0) +#define I915_EXEC_FENCE_SIGNAL (1<<1) + __u32 flags; +}; + struct drm_i915_gem_execbuffer2 { /** * List of gem_exec_object2 structs @@ -826,7 +846,11 @@ struct drm_i915_gem_execbuffer2 { __u32 DR1; __u32 DR4; __u32 num_cliprects; - /** This is a struct drm_clip_rect *cliprects */ + /** + * This is a struct drm_clip_rect *cliprects if I915_EXEC_FENCE_ARRAY + * is not set. If I915_EXEC_FENCE_ARRAY is set, then this is a + * struct drm_i915_gem_exec_fence *fences. + */ __u64 cliprects_ptr; #define I915_EXEC_RING_MASK (7<<0) #define I915_EXEC_DEFAULT (0<<0) @@ -927,7 +951,14 @@ struct drm_i915_gem_execbuffer2 { * element). */ #define I915_EXEC_BATCH_FIRST (1<<18) -#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_BATCH_FIRST<<1)) + +/* Setting I915_FENCE_ARRAY implies that num_cliprects and cliprects_ptr + * define an array of i915_gem_exec_fence structures which specify a set of + * dma fences to wait upon or signal. + */ +#define I915_EXEC_FENCE_ARRAY (1<<19) + +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1)) #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff) #define i915_execbuffer2_set_context_id(eb2, context) \ @@ -1467,6 +1498,22 @@ enum drm_i915_perf_record_type { DRM_I915_PERF_RECORD_MAX /* non-ABI */ }; +/** + * Structure to upload perf dynamic configuration into the kernel. + */ +struct drm_i915_perf_oa_config { + /** String formatted like "%08x-%04x-%04x-%04x-%012x" */ + char uuid[36]; + + __u32 n_mux_regs; + __u32 n_boolean_regs; + __u32 n_flex_regs; + + __u64 __user mux_regs_ptr; + __u64 __user boolean_regs_ptr; + __u64 __user flex_regs_ptr; +}; + #if defined(__cplusplus) } #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 461811e57140..43ab5c402f98 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -143,12 +143,6 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -enum bpf_sockmap_flags { - BPF_SOCKMAP_UNSPEC, - BPF_SOCKMAP_STRPARSER, - __MAX_BPF_SOCKMAP_FLAG -}; - /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -368,9 +362,20 @@ union bpf_attr { * int bpf_redirect(ifindex, flags) * redirect to another netdev * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT + * @flags: + * cls_bpf: + * bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * xdp_bpf: + * all bits - reserved + * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error + * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_redirect_map(map, key, flags) + * redirect to endpoint in map + * @map: pointer to dev map + * @key: index in map to lookup + * @flags: -- + * Return: XDP_REDIRECT on success or XDP_ABORT on error * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -632,7 +637,7 @@ union bpf_attr { FN(skb_adjust_room), \ FN(redirect_map), \ FN(sk_redirect_map), \ - FN(sock_map_update), + FN(sock_map_update), \ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -753,20 +758,23 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. Unknown return codes will result - * in packet drop. + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). */ enum xdp_action { XDP_ABORTED = 0, XDP_DROP, XDP_PASS, XDP_TX, + XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6cd63c18708a..838887587411 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size { struct kvm_ppc_smmu_info { __u64 flags; __u32 slb_size; - __u32 pad; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index 81d8edf11789..a937480d7cd3 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -1,7 +1,8 @@ #ifndef _UAPI_LINUX_MMAN_H #define _UAPI_LINUX_MMAN_H -#include +#include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -10,4 +11,25 @@ #define OVERCOMMIT_ALWAYS 1 #define OVERCOMMIT_NEVER 2 +/* + * Huge page size encoding when MAP_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + #endif /* _UAPI_LINUX_MMAN_H */ -- cgit From f1e52f14a69386ac460a8d700df0647a631cf595 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 22 Sep 2017 15:41:44 -0300 Subject: perf evsel: Fix attr.exclude_kernel setting for default cycles:p Yet another fix for probing the max attr.precise_ip setting: it is not enough settting attr.exclude_kernel for !root users, as they _can_ profile the kernel if the kernel.perf_event_paranoid sysctl is set to -1, so check that as well. Testing it: As non root: $ sysctl kernel.perf_event_paranoid kernel.perf_event_paranoid = 2 $ perf record sleep 1 $ perf evlist -v cycles:uppp: ..., exclude_kernel: 1, ... precise_ip: 3, ... Now as non-root, but with kernel.perf_event_paranoid set set to the most permissive value, -1: $ sysctl kernel.perf_event_paranoid kernel.perf_event_paranoid = -1 $ perf record sleep 1 $ perf evlist -v cycles:ppp: ..., exclude_kernel: 0, ... precise_ip: 3, ... $ I.e. non-root, default kernel.perf_event_paranoid: :uppp modifier = not allowed to sample the kernel, non-root, most permissible kernel.perf_event_paranoid: :ppp = allowed to sample the kernel. In both cases, use the highest available precision: attr.precise_ip = 3. Reported-and-Tested-by: Ingo Molnar Cc: Adrian Hunter Cc: Andy Lutomirski Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: d37a36979077 ("perf evsel: Fix attr.exclude_kernel setting for default cycles:p") Link: http://lkml.kernel.org/n/tip-nj2qkf75xsd6pw6hhjzfqqdx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 4bb89373eb52..0dccdb89572c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -271,12 +271,17 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) return evsel; } +static bool perf_event_can_profile_kernel(void) +{ + return geteuid() == 0 || perf_event_paranoid() == -1; +} + struct perf_evsel *perf_evsel__new_cycles(bool precise) { struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .exclude_kernel = geteuid() != 0, + .exclude_kernel = !perf_event_can_profile_kernel(), }; struct perf_evsel *evsel; -- cgit From 44d8143340a99b167c74365e844516b73523c087 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:40 -0700 Subject: KEYS: fix cred refcount leak in request_key_auth_new() In request_key_auth_new(), if key_alloc() or key_instantiate_and_link() were to fail, we would leak a reference to the 'struct cred'. Currently this can only happen if key_alloc() fails to allocate memory. But it still should be fixed, as it is a more severe bug waiting to happen. Fix it by cleaning things up to use a helper function which frees a 'struct request_key_auth' correctly. Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 68 ++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index afe9d22ab361..69d6b3b35470 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -120,6 +120,18 @@ static void request_key_auth_revoke(struct key *key) } } +static void free_request_key_auth(struct request_key_auth *rka) +{ + if (!rka) + return; + key_put(rka->target_key); + key_put(rka->dest_keyring); + if (rka->cred) + put_cred(rka->cred); + kfree(rka->callout_info); + kfree(rka); +} + /* * Destroy an instantiation authorisation token key. */ @@ -129,15 +141,7 @@ static void request_key_auth_destroy(struct key *key) kenter("{%d}", key->serial); - if (rka->cred) { - put_cred(rka->cred); - rka->cred = NULL; - } - - key_put(rka->target_key); - key_put(rka->dest_keyring); - kfree(rka->callout_info); - kfree(rka); + free_request_key_auth(rka); } /* @@ -151,22 +155,17 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, const struct cred *cred = current->cred; struct key *authkey = NULL; char desc[20]; - int ret; + int ret = -ENOMEM; kenter("%d,", target->serial); /* allocate a auth record */ - rka = kmalloc(sizeof(*rka), GFP_KERNEL); - if (!rka) { - kleave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } + rka = kzalloc(sizeof(*rka), GFP_KERNEL); + if (!rka) + goto error; rka->callout_info = kmalloc(callout_len, GFP_KERNEL); - if (!rka->callout_info) { - kleave(" = -ENOMEM"); - kfree(rka); - return ERR_PTR(-ENOMEM); - } + if (!rka->callout_info) + goto error_free_rka; /* see if the calling process is already servicing the key request of * another process */ @@ -176,8 +175,12 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ - if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags)) - goto auth_key_revoked; + if (test_bit(KEY_FLAG_REVOKED, + &cred->request_key_auth->flags)) { + up_read(&cred->request_key_auth->sem); + ret = -EKEYREVOKED; + goto error_free_rka; + } irka = cred->request_key_auth->payload.data[0]; rka->cred = get_cred(irka->cred); @@ -205,32 +208,23 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); - goto error_alloc; + goto error_free_rka; } /* construct the auth key */ ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL); if (ret < 0) - goto error_inst; + goto error_put_authkey; kleave(" = {%d,%d}", authkey->serial, refcount_read(&authkey->usage)); return authkey; -auth_key_revoked: - up_read(&cred->request_key_auth->sem); - kfree(rka->callout_info); - kfree(rka); - kleave("= -EKEYREVOKED"); - return ERR_PTR(-EKEYREVOKED); - -error_inst: +error_put_authkey: key_revoke(authkey); key_put(authkey); -error_alloc: - key_put(rka->target_key); - key_put(rka->dest_keyring); - kfree(rka->callout_info); - kfree(rka); +error_free_rka: + free_request_key_auth(rka); +error: kleave("= %d", ret); return ERR_PTR(ret); } -- cgit From f7b48cf08fa63a68b59c2894806ee478216d7f91 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:41 -0700 Subject: KEYS: don't revoke uninstantiated key in request_key_auth_new() If key_instantiate_and_link() were to fail (which fortunately isn't possible currently), the call to key_revoke(authkey) would crash with a NULL pointer dereference in request_key_auth_revoke() because the key has not yet been instantiated. Fix this by removing the call to key_revoke(). key_put() is sufficient, as it's not possible for an uninstantiated authkey to have been used for anything yet. Fixes: b5f545c880a2 ("[PATCH] keys: Permit running process to instantiate keys") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 69d6b3b35470..e356075ed2f8 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -220,7 +220,6 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, return authkey; error_put_authkey: - key_revoke(authkey); key_put(authkey); error_free_rka: free_request_key_auth(rka); -- cgit From 884bee0215fcc239b30c062c37ca29077005e064 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:12 -0700 Subject: KEYS: fix key refcount leak in keyctl_assume_authority() In keyctl_assume_authority(), if keyctl_change_reqkey_auth() were to fail, we would leak the reference to the 'authkey'. Currently this can only happen if prepare_creds() fails to allocate memory. But it still should be fixed, as it is a more severe bug waiting to happen. This patch also moves the read of 'authkey->serial' to before the reference to the authkey is dropped. Doing the read after dropping the reference is very fragile because it assumes we still hold another reference to the key. (Which we do, in current->cred->request_key_auth, but there's no reason not to write it in the "obviously correct" way.) Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index ab0b337c84b4..562f7fe287a0 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1406,11 +1406,9 @@ long keyctl_assume_authority(key_serial_t id) } ret = keyctl_change_reqkey_auth(authkey); - if (ret < 0) - goto error; + if (ret == 0) + ret = authkey->serial; key_put(authkey); - - ret = authkey->serial; error: return ret; } -- cgit From 7fc0786d956d9e59b68d282be9b156179846ea3d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:31 -0700 Subject: KEYS: fix key refcount leak in keyctl_read_key() In keyctl_read_key(), if key_permission() were to return an error code other than EACCES, we would leak a the reference to the key. This can't actually happen currently because key_permission() can only return an error code other than EACCES if security_key_permission() does, only SELinux and Smack implement that hook, and neither can return an error code other than EACCES. But it should still be fixed, as it is a bug waiting to happen. Fixes: 29db91906340 ("[PATCH] Keys: Add LSM hooks for key management [try #3]") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 562f7fe287a0..aa1d11a29136 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -771,7 +771,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) if (ret == 0) goto can_read_key; if (ret != -EACCES) - goto error; + goto error2; /* we can't; see if it's searchable from this process's keyrings * - we automatically take account of the fact that it may be -- cgit From e645016abc803dafc75e4b8f6e4118f088900ffb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:45 -0700 Subject: KEYS: fix writing past end of user-supplied buffer in keyring_read() Userspace can call keyctl_read() on a keyring to get the list of IDs of keys in the keyring. But if the user-supplied buffer is too small, the kernel would write the full list anyway --- which will corrupt whatever userspace memory happened to be past the end of the buffer. Fix it by only filling the space that is available. Fixes: b2a4df200d57 ("KEYS: Expand the capacity of a keyring") Cc: [v3.13+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyring.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index de81793f9920..94f038967c17 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -423,7 +423,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) } struct keyring_read_iterator_context { - size_t qty; + size_t buflen; size_t count; key_serial_t __user *buffer; }; @@ -435,9 +435,9 @@ static int keyring_read_iterator(const void *object, void *data) int ret; kenter("{%s,%d},,{%zu/%zu}", - key->type->name, key->serial, ctx->count, ctx->qty); + key->type->name, key->serial, ctx->count, ctx->buflen); - if (ctx->count >= ctx->qty) + if (ctx->count >= ctx->buflen) return 1; ret = put_user(key->serial, ctx->buffer); @@ -472,16 +472,12 @@ static long keyring_read(const struct key *keyring, return 0; /* Calculate how much data we could return */ - ctx.qty = nr_keys * sizeof(key_serial_t); - if (!buffer || !buflen) - return ctx.qty; - - if (buflen > ctx.qty) - ctx.qty = buflen; + return nr_keys * sizeof(key_serial_t); /* Copy the IDs of the subscribed keys into the buffer */ ctx.buffer = (key_serial_t __user *)buffer; + ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { -- cgit From 237bbd29f7a049d310d907f4b2716a7feef9abf3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:03 -0700 Subject: KEYS: prevent creating a different user's keyrings It was possible for an unprivileged user to create the user and user session keyrings for another user. For example: sudo -u '#3000' sh -c 'keyctl add keyring _uid.4000 "" @u keyctl add keyring _uid_ses.4000 "" @u sleep 15' & sleep 1 sudo -u '#4000' keyctl describe @u sudo -u '#4000' keyctl describe @us This is problematic because these "fake" keyrings won't have the right permissions. In particular, the user who created them first will own them and will have full access to them via the possessor permissions, which can be used to compromise the security of a user's keys: -4: alswrv-----v------------ 3000 0 keyring: _uid.4000 -5: alswrv-----v------------ 3000 0 keyring: _uid_ses.4000 Fix it by marking user and user session keyrings with a flag KEY_FLAG_UID_KEYRING. Then, when searching for a user or user session keyring by name, skip all keyrings that don't have the flag set. Fixes: 69664cf16af4 ("keys: don't generate user and user session keyrings unless they're accessed") Cc: [v2.6.26+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- include/linux/key.h | 2 ++ security/keys/internal.h | 2 +- security/keys/key.c | 2 ++ security/keys/keyring.c | 23 ++++++++++++++--------- security/keys/process_keys.c | 6 ++++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index 044114185120..e315e16b6ff8 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -187,6 +187,7 @@ struct key { #define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 10 /* set if key should not be removed */ +#define KEY_FLAG_UID_KEYRING 11 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -243,6 +244,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ +#define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); diff --git a/security/keys/internal.h b/security/keys/internal.h index 1c02c6547038..503adbae7b0d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -141,7 +141,7 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); -extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); +extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int install_user_keyrings(void); extern int install_thread_keyring_to_cred(struct cred *); diff --git a/security/keys/key.c b/security/keys/key.c index 83da68d98b40..e5c0896c3a8f 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -302,6 +302,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; + if (flags & KEY_ALLOC_UID_KEYRING) + key->flags |= 1 << KEY_FLAG_UID_KEYRING; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 94f038967c17..4fa82a8a9c0e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1097,15 +1097,15 @@ found: /* * Find a keyring with the specified name. * - * All named keyrings in the current user namespace are searched, provided they - * grant Search permission directly to the caller (unless this check is - * skipped). Keyrings whose usage points have reached zero or who have been - * revoked are skipped. + * Only keyrings that have nonzero refcount, are not revoked, and are owned by a + * user in the current user namespace are considered. If @uid_keyring is %true, + * the keyring additionally must have been allocated as a user or user session + * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ -struct key *find_keyring_by_name(const char *name, bool skip_perm_check) +struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct key *keyring; int bucket; @@ -1133,10 +1133,15 @@ struct key *find_keyring_by_name(const char *name, bool skip_perm_check) if (strcmp(keyring->description, name) != 0) continue; - if (!skip_perm_check && - key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) - continue; + if (uid_keyring) { + if (!test_bit(KEY_FLAG_UID_KEYRING, + &keyring->flags)) + continue; + } else { + if (key_permission(make_key_ref(keyring, 0), + KEY_NEED_SEARCH) < 0) + continue; + } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 86bced9fdbdf..293d3598153b 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -77,7 +77,8 @@ int install_user_keyrings(void) if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -94,7 +95,8 @@ int install_user_keyrings(void) session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); -- cgit From 37863c43b2c6464f252862bf2e9768264e961678 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:23 -0700 Subject: KEYS: prevent KEYCTL_READ on negative key Because keyctl_read_key() looks up the key with no permissions requested, it may find a negatively instantiated key. If the key is also possessed, we went ahead and called ->read() on the key. But the key payload will actually contain the ->reject_error rather than the normal payload. Thus, the kernel oopses trying to read the user_key_payload from memory address (int)-ENOKEY = 0x00000000ffffff82. Fortunately the payload data is stored inline, so it shouldn't be possible to abuse this as an arbitrary memory read primitive... Reproducer: keyctl new_session keyctl request2 user desc '' @s keyctl read $(keyctl show | awk '/user: desc/ {print $1}') It causes a crash like the following: BUG: unable to handle kernel paging request at 00000000ffffff92 IP: user_read+0x33/0xa0 PGD 36a54067 P4D 36a54067 PUD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 211 Comm: keyctl Not tainted 4.14.0-rc1 #337 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 task: ffff90aa3b74c3c0 task.stack: ffff9878c0478000 RIP: 0010:user_read+0x33/0xa0 RSP: 0018:ffff9878c047bee8 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff90aa3d7da340 RCX: 0000000000000017 RDX: 0000000000000000 RSI: 00000000ffffff82 RDI: ffff90aa3d7da340 RBP: ffff9878c047bf00 R08: 00000024f95da94f R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f58ece69740(0000) GS:ffff90aa3e200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000ffffff92 CR3: 0000000036adc001 CR4: 00000000003606f0 Call Trace: keyctl_read_key+0xac/0xe0 SyS_keyctl+0x99/0x120 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x7f58ec787bb9 RSP: 002b:00007ffc8d401678 EFLAGS: 00000206 ORIG_RAX: 00000000000000fa RAX: ffffffffffffffda RBX: 00007ffc8d402800 RCX: 00007f58ec787bb9 RDX: 0000000000000000 RSI: 00000000174a63ac RDI: 000000000000000b RBP: 0000000000000004 R08: 00007ffc8d402809 R09: 0000000000000020 R10: 0000000000000000 R11: 0000000000000206 R12: 00007ffc8d402800 R13: 00007ffc8d4016e0 R14: 0000000000000000 R15: 0000000000000000 Code: e5 41 55 49 89 f5 41 54 49 89 d4 53 48 89 fb e8 a4 b4 ad ff 85 c0 74 09 80 3d b9 4c 96 00 00 74 43 48 8b b3 20 01 00 00 4d 85 ed <0f> b7 5e 10 74 29 4d 85 e4 74 24 4c 39 e3 4c 89 e2 4c 89 ef 48 RIP: user_read+0x33/0xa0 RSP: ffff9878c047bee8 CR2: 00000000ffffff92 Fixes: 61ea0c0ba904 ("KEYS: Skip key state checks when checking for possession") Cc: [v3.13+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index aa1d11a29136..365ff85d7e27 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -766,6 +766,11 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) key = key_ref_to_ptr(key_ref); + if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { + ret = -ENOKEY; + goto error2; + } + /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); if (ret == 0) -- cgit From 8f674565d405a8c0b36ee531849df87f43e72ed5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:39 -0700 Subject: KEYS: reset parent each time before searching key_user_tree In key_user_lookup(), if there is no key_user for the given uid, we drop key_user_lock, allocate a new key_user, and search the tree again. But we failed to set 'parent' to NULL at the beginning of the second search. If the tree were to be empty for the second search, the insertion would be done with an invalid 'parent', scribbling over freed memory. Fortunately this can't actually happen currently because the tree always contains at least the root_key_user. But it still should be fixed to make the code more robust. Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/keys/key.c b/security/keys/key.c index e5c0896c3a8f..eb914a838840 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -54,10 +54,10 @@ void __key_check(const struct key *key) struct key_user *key_user_lookup(kuid_t uid) { struct key_user *candidate = NULL, *user; - struct rb_node *parent = NULL; - struct rb_node **p; + struct rb_node *parent, **p; try_again: + parent = NULL; p = &key_user_tree.rb_node; spin_lock(&key_user_lock); -- cgit From 4aa68e07d845562561f5e73c04aa521376e95252 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:38:29 -0700 Subject: KEYS: restrict /proc/keys by credentials at open time When checking for permission to view keys whilst reading from /proc/keys, we should use the credentials with which the /proc/keys file was opened. This is because, in a classic type of exploit, it can be possible to bypass checks for the *current* credentials by passing the file descriptor to a suid program. Following commit 34dbbcdbf633 ("Make file credentials available to the seqfile interfaces") we can finally fix it. So let's do it. Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/proc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/security/keys/proc.c b/security/keys/proc.c index bf08d02b6646..de834309d100 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -187,7 +187,7 @@ static int proc_keys_show(struct seq_file *m, void *v) struct keyring_search_context ctx = { .index_key.type = key->type, .index_key.description = key->description, - .cred = current_cred(), + .cred = m->file->f_cred, .match_data.cmp = lookup_user_key_possessed, .match_data.raw_data = key, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, @@ -207,11 +207,7 @@ static int proc_keys_show(struct seq_file *m, void *v) } } - /* check whether the current task is allowed to view the key (assuming - * non-possession) - * - the caller holds a spinlock, and thus the RCU read lock, making our - * access to __current_cred() safe - */ + /* check whether the current task is allowed to view the key */ rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW); if (rc < 0) return 0; -- cgit From e007ce9c59bddd1e67b94bc29036d920f5c5428a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:42 -0700 Subject: KEYS: use kmemdup() in request_key_auth_new() kmemdup() is preferred to kmalloc() followed by memcpy(). Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index e356075ed2f8..6ebf1af8fce9 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -163,9 +163,10 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, rka = kzalloc(sizeof(*rka), GFP_KERNEL); if (!rka) goto error; - rka->callout_info = kmalloc(callout_len, GFP_KERNEL); + rka->callout_info = kmemdup(callout_info, callout_len, GFP_KERNEL); if (!rka->callout_info) goto error_free_rka; + rka->callout_len = callout_len; /* see if the calling process is already servicing the key request of * another process */ @@ -196,8 +197,6 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, rka->target_key = key_get(target); rka->dest_keyring = key_get(dest_keyring); - memcpy(rka->callout_info, callout_info, callout_len); - rka->callout_len = callout_len; /* allocate the auth key */ sprintf(desc, "%x", target->serial); -- cgit From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 17:48:06 +0200 Subject: futex: Fix pi_state->owner serialization There was a reported suspicion about a race between exit_pi_state_list() and put_pi_state(). The same report mentioned the comment with put_pi_state() said it should be called with hb->lock held, and it no longer is in all places. As it turns out, the pi_state->owner serialization is indeed broken. As per the new rules: 734009e96d19 ("futex: Change locking rules") pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock. For the sites setting pi_state->owner we already hold wait_lock (where required) but exit_pi_state_list() and put_pi_state() were not and raced on clearing it. Fixes: 734009e96d19 ("futex: Change locking rules") Reported-by: Gratian Crisan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: dvhart@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net --- kernel/futex.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 3d38eaf05492..0518a0bfc746 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state) /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. - * - * Must be called with the hb lock held. */ static void put_pi_state(struct futex_pi_state *pi_state) { @@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + struct task_struct *owner; - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + owner = pi_state->owner; + if (owner) { + raw_spin_lock(&owner->pi_lock); + list_del_init(&pi_state->list); + raw_spin_unlock(&owner->pi_lock); + } + rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) + if (current->pi_state_cache) { kfree(pi_state); - else { + } else { /* * pi_state->list is already empty. * clear pi_state->owner. @@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); continue; } @@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock(&curr->pi_lock); get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); @@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ pi_state->owner = p; raw_spin_unlock_irq(&p->pi_lock); @@ -2878,6 +2888,7 @@ retry: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); put_pi_state(pi_state); -- cgit From 2827a418ca1b23e432e62c9b3d0e7cf3255dfe88 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 19 Sep 2017 22:04:12 +0200 Subject: genirq: Check __free_irq() return value for NULL __free_irq() can return a NULL irqaction for example when trying to free already-free IRQ, but the callsite unconditionally dereferences the returned pointer. Fix this by adding a check and return NULL. Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170919200412.GA29985@gmail.com --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..d00132b5c325 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id) #endif action = __free_irq(irq, dev_id); + + if (!action) + return NULL; + devname = action->name; kfree(action); return devname; -- cgit From 02a4843618fb35f847cf8c31cd3893873aa0edde Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 13 Sep 2017 09:17:57 -0400 Subject: brd: fix overflow in __brd_direct_access The code in __brd_direct_access multiplies the pgoff variable by page size and divides it by 512. It can cause overflow on 32-bit architectures. The overflow happens if we create ramdisk larger than 4G and use it as a sparse device. This patch replaces multiplication and division with multiplication by the number of sectors per page. Reviewed-by: Dan Williams Signed-off-by: Mikulas Patocka Fixes: 1647b9b959c7 ("brd: add dax_operations support") Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Jens Axboe --- drivers/block/brd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index bbd0d186cfc0..2d7178f7754e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -342,7 +342,7 @@ static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, if (!brd) return -ENODEV; - page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512); + page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT); if (!page) return -ENOSPC; *kaddr = page_address(page); -- cgit From f507b54dccfd8000c517d740bc45f20c74532d18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Sep 2017 13:54:35 +0200 Subject: bsg-lib: don't free job in bsg_prepare_job The job structure is allocated as part of the request, so we should not free it in the error path of bsg_prepare_job. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bsg-lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index c82408c7cc3c..dbddff8174e5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -154,7 +154,6 @@ static int bsg_prepare_job(struct device *dev, struct request *req) failjob_rls_rqst_payload: kfree(job->request_payload.sg_list); failjob_rls_job: - kfree(job); return -ENOMEM; } -- cgit From 1dae69bedeeca0b57e441eae491fbd38049c0b47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 May 2017 22:25:18 -0400 Subject: nbd: ignore non-nbd ioctl's In testing we noticed that nbd would spew if you ran a fio job against the raw device itself. This is because fio calls a block device specific ioctl, however the block layer will first pass this back to the driver ioctl handler in case the driver wants to do something special. Since the device was setup using netlink this caused us to spew every time fio called this ioctl. Since we don't have special handling, just error out for any non-nbd specific ioctl's that come in. This fixes the spew. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2aa87cbdede0..3684e21d543f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1194,6 +1194,12 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* The block layer will pass back some non-nbd ioctls in case we have + * special handling for them, but we don't so just return an error. + */ + if (_IOC_TYPE(cmd) != 0xab) + return -EINVAL; + mutex_lock(&nbd->config_lock); /* Don't allow ioctl operations on a nbd device that was created with -- cgit From 5acb3cc2c2e9d3020a4fee43763c6463767f1572 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 20 Sep 2017 13:12:20 -0600 Subject: blktrace: Fix potential deadlock between delete & sysfs ops The lockdep code had reported the following unsafe locking scenario: CPU0 CPU1 ---- ---- lock(s_active#228); lock(&bdev->bd_mutex/1); lock(s_active#228); lock(&bdev->bd_mutex); *** DEADLOCK *** The deadlock may happen when one task (CPU1) is trying to delete a partition in a block device and another task (CPU0) is accessing tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that partition. The s_active isn't an actual lock. It is a reference count (kn->count) on the sysfs (kernfs) file. Removal of a sysfs file, however, require a wait until all the references are gone. The reference count is treated like a rwsem using lockdep instrumentation code. The fact that a thread is in the sysfs callback method or in the ioctl call means there is a reference to the opended sysfs or device file. That should prevent the underlying block structure from being removed. Instead of using bd_mutex in the block_device structure, a new blk_trace_mutex is now added to the request_queue structure to protect access to the blk_trace structure. Suggested-by: Christoph Hellwig Signed-off-by: Waiman Long Acked-by: Steven Rostedt (VMware) Fix typo in patch subject line, and prune a comment detailing how the code used to work. Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ include/linux/blkdev.h | 1 + kernel/trace/blktrace.c | 18 ++++++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index aebe676225e6..048be4aa6024 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -854,6 +854,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) kobject_init(&q->kobj, &blk_queue_ktype); +#ifdef CONFIG_BLK_DEV_IO_TRACE + mutex_init(&q->blk_trace_mutex); +#endif mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 460294bb0fa5..02fa42d24b52 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -551,6 +551,7 @@ struct request_queue { int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace *blk_trace; + struct mutex blk_trace_mutex; #endif /* * for flush operations diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) } EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. + */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: @@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { if (value) @@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: -- cgit From e5313c141b49c1b1af43d1ca81398185d66ad1a6 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 20 Sep 2017 14:24:34 -0700 Subject: loop: remove union of use_aio and ref in struct loop_cmd When the request is completed, lo_complete_rq() checks cmd->use_aio. However, if this is in fact an aio request, cmd->use_aio will have already been reused as cmd->ref by lo_rw_aio*. Fix it by not using a union. On x86_64, there's a hole after the union anyways, so this doesn't make struct loop_cmd any bigger. Fixes: 92d773324b7e ("block/loop: fix use after free") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index f68c1d50802f..1f3956702993 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -67,10 +67,8 @@ struct loop_device { struct loop_cmd { struct kthread_work work; struct request *rq; - union { - bool use_aio; /* use AIO interface to handle I/O */ - atomic_t ref; /* only for aio */ - }; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ long ret; struct kiocb iocb; struct bio_vec *bvec; -- cgit From 56b7103a06083b8ce1160f8289460ba2f584e182 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:26 -0700 Subject: nvme-fc: remove use of FC-specific error codes The FC-NVME transport used the FC-specific error codes in cases where it had to fabricate an error to go back up stack. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d2e882c0f496..9100779b58c9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1376,7 +1376,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); else if (freq->status) - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); /* * For the linux implementation, if we have an unsuccesful @@ -1404,7 +1404,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) */ if (freq->transferred_length != be32_to_cpu(op->cmd_iu.data_len)) { - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } result.u64 = 0; @@ -1421,7 +1421,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) freq->transferred_length || op->rsp_iu.status_code || sqe->common.command_id != cqe->command_id)) { - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } result = cqe->result; @@ -1429,7 +1429,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } -- cgit From 29b3d26ecc8046838de88205b7c4b182ac27ff65 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:27 -0700 Subject: nvmet-fc: remove use of FC-specific error codes The FC-NVME target transport used the FC-specific error codes in return codes when the transport or lldd failed. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 421e43bf1dd7..088f07250d76 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1910,8 +1910,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, spin_lock_irqsave(&fod->flock, flags); fod->writedataactive = false; spin_unlock_irqrestore(&fod->flock, flags); - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ { fcpreq->fcp_error = ret; fcpreq->transferred_length = 0; @@ -1929,8 +1928,7 @@ __nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort) /* if in the middle of an io and we need to tear down */ if (abort) { if (fcpreq->op == NVMET_FCOP_WRITEDATA) { - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return true; } @@ -1968,8 +1966,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) fod->abort = true; spin_unlock(&fod->flock); - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return; } -- cgit From fc9608e8b4dc3c2545fa0bc5d3eef158ca56ccf8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:28 -0700 Subject: nvmet-fcloop: remove use of FC-specific error codes The FC-NVME transport loopback test module used the FC-specific error codes in cases where it emulated a transport abort case. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fcloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1cb9847ec261..1fd1afbb8b2a 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -576,7 +576,7 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, tfcp_req->aborted = true; spin_unlock(&tfcp_req->reqlock); - tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED; + tfcp_req->status = NVME_SC_INTERNAL; /* * nothing more to do. If io wasn't active, the transport should -- cgit From 8e009ce84683fa124b23ff5cb7fd87c48b331b88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:29 -0700 Subject: lpfc: remove use of FC-specific error codes The lpfc driver uses the FC-specific error when it needed to return an error to the FC-NVME transport. Convert to use a generic value instead. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/lpfc/lpfc_nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 79ba3ce063a4..23bdb1ca106e 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -884,7 +884,7 @@ out_err: wcqe->total_data_placed); nCmd->transferred_length = 0; nCmd->rcv_rsplen = 0; - nCmd->status = NVME_SC_FC_TRANSPORT_ERROR; + nCmd->status = NVME_SC_INTERNAL; } } -- cgit From 39a550d2d9eaee8b618084e6011441eac6a2a3b7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 11:30:15 -0700 Subject: qla2xxx: remove use of FC-specific error codes The qla2xxx driver uses the FC-specific error when it needed to return an error to the FC-NVME transport. Convert to use a generic value instead. Signed-off-by: James Smart Acked-by: Himanshu Madhani Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/qla2xxx/qla_nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 1f59e7a74c7b..6b33a1f24f56 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -180,7 +180,7 @@ static void qla_nvme_sp_done(void *ptr, int res) goto rel; if (unlikely(res == QLA_FUNCTION_FAILED)) - fd->status = NVME_SC_FC_TRANSPORT_ERROR; + fd->status = NVME_SC_INTERNAL; else fd->status = 0; -- cgit From c98cb3bd882119e7e1a7c8df2f1eacfcc701450b Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:25 -0700 Subject: nvme.h: remove FC transport-specific error values The NVM express group recinded the reserved range for the transport. Remove the FC-centric values that had been defined. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 87723c86f136..2440be32be1d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1127,19 +1127,6 @@ enum { NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, - - - /* - * FC Transport-specific error status values for NVME commands - * - * Transport-specific status code values must be in the range 0xB0..0xBF - */ - - /* Generic FC failure - catchall */ - NVME_SC_FC_TRANSPORT_ERROR = 0x00B0, - - /* I/O failure due to FC ABTS'd */ - NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1, }; struct nvme_completion { -- cgit From d85cf207499e6740ab9c490ff4f360af5c432d23 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:23 -0700 Subject: nvme: add transport SGL definitions Add transport SGL defintions from NVMe TP 4008, required for the final NVMe-FC standard. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2440be32be1d..9310ce77d8e1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -471,12 +471,14 @@ enum nvme_opcode { * * @NVME_SGL_FMT_ADDRESS: absolute address of the data block * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation * request subtype */ enum { NVME_SGL_FMT_ADDRESS = 0x00, NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, NVME_SGL_FMT_INVALIDATE = 0x0f, }; @@ -490,12 +492,16 @@ enum { * * For struct nvme_keyed_sgl_desc: * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor */ enum { NVME_SGL_FMT_DATA_DESC = 0x00, NVME_SGL_FMT_SEG_DESC = 0x02, NVME_SGL_FMT_LAST_SEG_DESC = 0x03, NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, }; struct nvme_sgl_desc { -- cgit From d9d34c0b2327e85da0ad1476575264fe957fc6ef Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:24 -0700 Subject: nvme-fc: use transport-specific sgl format Sync with NVM Express spec change and FC-NVME 1.18. FC transport sets SGL type to Transport SGL Data Block Descriptor and subtype to transport-specific value 0x0A. Removed the warn-on's on the PRP fields. They are unneeded. They were to check for values from the upper layer that weren't set right, and for the most part were fine. But, with Async events, which reuse the same structure and 2nd time issued the SGL overlay converted them to the Transport SGL values - the warn-on's were errantly firing. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9100779b58c9..af075e998944 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1989,16 +1989,17 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * as well as those by FC-NVME spec. */ WARN_ON_ONCE(sqe->common.metadata); - WARN_ON_ONCE(sqe->common.dptr.prp1); - WARN_ON_ONCE(sqe->common.dptr.prp2); sqe->common.flags |= NVME_CMD_SGL_METABUF; /* - * format SQE DPTR field per FC-NVME rules - * type=data block descr; subtype=offset; - * offset is currently 0. + * format SQE DPTR field per FC-NVME rules: + * type=0x5 Transport SGL Data Block Descriptor + * subtype=0xA Transport-specific value + * address=0 + * length=length of the data series */ - sqe->rw.dptr.sgl.type = NVME_SGL_FMT_OFFSET; + sqe->rw.dptr.sgl.type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; sqe->rw.dptr.sgl.length = cpu_to_le32(data_len); sqe->rw.dptr.sgl.addr = 0; -- cgit From deb61742e060d4447712598bc11bb50f8b2e51dd Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 11 Sep 2017 16:16:53 -0700 Subject: nvmet-fc: fix failing max io queue connections fc transport is treating NVMET_NR_QUEUES as maximum queue count, e.g. admin queue plus NVMET_NR_QUEUES-1 io queues. But NVMET_NR_QUEUES is the number of io queues, so maximum queue count is really NVMET_NR_QUEUES+1. Fix the handling in the target fc transport Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 088f07250d76..c48c83d97e30 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -148,7 +148,7 @@ struct nvmet_fc_tgt_assoc { u32 a_id; struct nvmet_fc_tgtport *tgtport; struct list_head a_list; - struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES]; + struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; }; @@ -608,7 +608,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, unsigned long flags; int ret; - if (qid >= NVMET_NR_QUEUES) + if (qid > NVMET_NR_QUEUES) return NULL; queue = kzalloc((sizeof(*queue) + @@ -888,7 +888,7 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) int i; spin_lock_irqsave(&tgtport->lock, flags); - for (i = NVMET_NR_QUEUES - 1; i >= 0; i--) { + for (i = NVMET_NR_QUEUES; i >= 0; i--) { queue = assoc->queues[i]; if (queue) { if (!nvmet_fc_tgt_q_get(queue)) -- cgit From 161b8be2bd6abad250d4b3f674bdd5480f15beeb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 14 Sep 2017 13:54:39 -0400 Subject: nvme-pci: initialize queue memory before interrupts A spurious interrupt before the nvme driver has initialized the completion queue may inadvertently cause the driver to believe it has a completion to process. This may result in a NULL dereference since the nvmeq's tags are not set at this point. The patch initializes the host's CQ memory so that a spurious interrupt isn't mistaken for a real completion. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4a2121335f48..004018c5dccc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1313,11 +1313,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_cq; + nvme_init_queue(nvmeq, qid); result = queue_request_irq(nvmeq); if (result < 0) goto release_sq; - nvme_init_queue(nvmeq, qid); return result; release_sq: @@ -1464,6 +1464,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) return result; nvmeq->cq_vector = 0; + nvme_init_queue(nvmeq, 0); result = queue_request_irq(nvmeq); if (result) { nvmeq->cq_vector = -1; @@ -2156,7 +2157,6 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) goto out; -- cgit From d08774738446e77734777adcf5d1045237b4475a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 15 Sep 2017 13:05:38 -0400 Subject: nvme-pci: Print invalid SGL only once The WARN_ONCE macro returns true if the condition is true, not if the warn was raised, so we're printing the scatter list every time it's invalid. This is excessive and makes debugging harder, so this patch prints it just once. Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 004018c5dccc..cb73bc8cad3b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -540,6 +541,20 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif +static void nvme_print_sgl(struct scatterlist *sgl, int nents) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + dma_addr_t phys = sg_phys(sg); + pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " + "dma_address:%pad dma_length:%d\n", + i, &phys, sg->offset, sg->length, &sg_dma_address(sg), + sg_dma_len(sg)); + } +} + static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -622,19 +637,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; bad_sgl: - if (WARN_ONCE(1, "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->nents)) { - for_each_sg(iod->sg, sg, iod->nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", i, &phys, - sg->offset, sg->length, - &sg_dma_address(sg), - sg_dma_len(sg)); - } - } + WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), + "Invalid SGL for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), iod->nents); return BLK_STS_IOERR; - } static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, -- cgit From cd48282cc736377d5abf7c04de8c6ba864ba3794 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 11:03:09 -0700 Subject: nvme: stop aer posting if controller state not live If an nvme async_event command completes, in most cases, a new async event is posted. However, if the controller enters a resetting or reconnecting state, there is nothing to block the scheduled work element from posting the async event again. Nor are there calls from the transport to stop async events when an association dies. In the case of FC, where the association is torn down, the aer must be aborted on the FC link and completes through the normal job completion path. Thus the terminated async event ends up being rescheduled even though the controller isn't in a valid state for the aer, and the reposting gets the transport into a partially torn down data structure. It's possible to hit the scenario on rdma, although much less likely due to an aer completing right as the association is terminated and as the association teardown reclaims the blk requests via nvme_cancel_request() so its immediate, not a link-related action like on FC. Fix by putting controller state checks in both the async event completion routine where it schedules the async event and in the async event work routine before it calls into the transport. It's effectively a "stop_async_events()" behavior. The transport, when it creates a new association with the subsystem will transition the state back to live and is already restarting the async event posting. Signed-off-by: James Smart [hch: remove taking a lock over reading the controller state] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index acc816b67582..d470f031e27f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2590,7 +2590,7 @@ static void nvme_async_event_work(struct work_struct *work) container_of(work, struct nvme_ctrl, async_event_work); spin_lock_irq(&ctrl->lock); - while (ctrl->event_limit > 0) { + while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { int aer_idx = --ctrl->event_limit; spin_unlock_irq(&ctrl->lock); @@ -2677,7 +2677,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, /*FALLTHRU*/ case NVME_SC_ABORT_REQ: ++ctrl->event_limit; - queue_work(nvme_wq, &ctrl->async_event_work); + if (ctrl->state == NVME_CTRL_LIVE) + schedule_work(&ctrl->async_event_work); break; default: break; -- cgit From 0951338d9677f546e230685d68631dfd3f81cca5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:18:04 -0700 Subject: nvme: allow timed-out ios to retry Currently the nvme_req_needs_retry() applies several checks to see if a retry is allowed. On of those is whether the current time has exceeded the start time of the io plus the timeout length. This check, if an io times out, means there is never a retry allowed for the io. Which means applications see the io failure. Remove this check and allow the io to timeout, like it does on other protocols, and retries to be made. On the FC transport, a frame can be lost for an individual io, and there may be no other errors that escalate for the connection/association. The io will timeout, which causes the transport to escalate into creating a new association, but the io that timed out, due to this retry logic, has already failed back to the application and things are hosed. Signed-off-by: James Smart Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d470f031e27f..5589f67d2cd8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -134,8 +134,6 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->status & NVME_SC_DNR) return false; - if (jiffies - req->start_time >= req->timeout) - return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; return true; -- cgit From 8edd11c9ad3a6205eea6de9d02eaf64c681a0658 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 14 Sep 2017 13:59:28 -0300 Subject: nvme-fabrics: Allow 0 as KATO value Currently, driver code allows user to set 0 as KATO (Keep Alive TimeOut), but this is not being respected. This patch enforces the expected behavior. Signed-off-by: Guilherme G. Piccoli Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fabrics.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 47307752dc65..555c976cc2ee 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -565,6 +565,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->queue_size = NVMF_DEF_QUEUE_SIZE; opts->nr_io_queues = num_online_cpus(); opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + opts->kato = NVME_DEFAULT_KATO; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -655,21 +656,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } - if (opts->discovery_nqn) { - pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n"); - ret = -EINVAL; - goto out; - } - if (token < 0) { pr_err("Invalid keep_alive_tmo %d\n", token); ret = -EINVAL; goto out; - } else if (token == 0) { + } else if (token == 0 && !opts->discovery_nqn) { /* Allowed for debug */ pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); } opts->kato = token; + + if (opts->discovery_nqn && opts->kato) { + pr_err("Discovery controllers cannot accept KATO != 0\n"); + ret = -EINVAL; + goto out; + } + break; case NVMF_OPT_CTRL_LOSS_TMO: if (match_int(args, &token)) { @@ -762,8 +764,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, uuid_copy(&opts->host->id, &hostid); out: - if (!opts->discovery_nqn && !opts->kato) - opts->kato = NVME_DEFAULT_KATO; kfree(options); return ret; } -- cgit From bb1cc74790eb51f52d23c6e5fd9a3bb16030c3d8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 18 Sep 2017 09:08:29 -0700 Subject: nvmet: implement valid sqhd values in completions To support sqhd, for initiators that are following the spec and paying attention to sqhd vs their sqtail values: - add sqhd to struct nvmet_sq - initialize sqhd to 0 in nvmet_sq_setup - rather than propagate the 0's-based qsize value from the connect message which requires a +1 in every sqhd update, and as nothing else references it, convert to 1's-based value in nvmt_sq/cq_setup() calls. - validate connect message sqsize being non-zero per spec. - updated assign sqhd for every completion that goes back. Also remove handling the NULL sq case in __nvmet_req_complete, as it can't happen with the current code. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 8 ++++---- drivers/nvme/target/fabrics-cmd.c | 9 +++++++-- drivers/nvme/target/nvmet.h | 1 + 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 7c23eaf8e563..c2a768a94235 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -390,10 +390,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (status) nvmet_set_status(req, status); - /* XXX: need to fill in something useful for sq_head */ - req->rsp->sq_head = 0; - if (likely(req->sq)) /* may happen during early failure */ - req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; + req->rsp->sq_head = cpu_to_le16(req->sq->sqhd); + req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; if (req->ns) @@ -420,6 +419,7 @@ void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, u16 size) { + sq->sqhd = 0; sq->qid = qid; sq->size = size; diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 859a66725291..db3bf6b8bf9e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -109,9 +109,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) pr_warn("queue already connected!\n"); return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; } + if (!sqsize) { + pr_warn("queue size zero!\n"); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } - nvmet_cq_setup(ctrl, req->cq, qid, sqsize); - nvmet_sq_setup(ctrl, req->sq, qid, sqsize); + /* note: convert queue size from 0's-based value to 1's-based value */ + nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); return 0; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7d261ab894f4..7b8e20adf760 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -74,6 +74,7 @@ struct nvmet_sq { struct percpu_ref ref; u16 qid; u16 size; + u16 sqhd; struct completion free_done; struct completion confirm_done; }; -- cgit From 332391a9935da939319e473b4680e173df75afcf Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 21 Sep 2017 08:16:29 -0600 Subject: fs: Fix page cache inconsistency when mixing buffered and AIO DIO Currently when mixing buffered reads and asynchronous direct writes it is possible to end up with the situation where we have stale data in the page cache while the new data is already written to disk. This is permanent until the affected pages are flushed away. Despite the fact that mixing buffered and direct IO is ill-advised it does pose a thread for a data integrity, is unexpected and should be fixed. Fix this by deferring completion of asynchronous direct writes to a process context in the case that there are mapped pages to be found in the inode. Later before the completion in dio_complete() invalidate the pages in question. This ensures that after the completion the pages in the written area are either unmapped, or populated with up-to-date data. Also do the same for the iomap case which uses iomap_dio_complete() instead. This has a side effect of deferring the completion to a process context for every AIO DIO that happens on inode that has pages mapped. However since the consensus is that this is ill-advised practice the performance implication should not be a problem. This was based on proposal from Jeff Moyer, thanks! Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Reviewed-by: Jeff Moyer Signed-off-by: Lukas Czerner Signed-off-by: Jens Axboe --- fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ fs/iomap.c | 29 ++++++++++++++++------------- mm/filemap.c | 10 ++++++++-- 3 files changed, 67 insertions(+), 21 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 5fa2211e49ae..62cf812ed0e5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; + int err; /* * AIO submission can race with bio completion to get here while @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (ret == 0) ret = transferred; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (ret > 0 && dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages) { + err = invalidate_inode_pages2_range(dio->inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + ret - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(err); + } + if (dio->end_io) { - int err; // XXX: ki_pos?? err = dio->end_io(dio->iocb, offset, ret, dio->private); @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio) struct dio *dio = bio->bi_private; unsigned long remaining; unsigned long flags; + bool defer_completion = false; /* cleanup the bio */ dio_bio_complete(dio, bio); @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - if (dio->result && dio->defer_completion) { + /* + * Defer completion when defer_completion is set or + * when the inode has pages mapped and this is AIO write. + * We need to invalidate those pages because there is a + * chance they contain stale data in the case buffered IO + * went in between AIO submission and completion into the + * same region. + */ + if (dio->result) + defer_completion = dio->defer_completion || + (dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages); + if (defer_completion) { INIT_WORK(&dio->complete_work, dio_aio_complete_work); queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * For AIO O_(D)SYNC writes we need to defer completions to a workqueue * so that we can call ->fsync. */ - if (dio->is_async && iov_iter_rw(iter) == WRITE && - ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host))) { - retval = dio_set_defer_completion(dio); + if (dio->is_async && iov_iter_rw(iter) == WRITE) { + retval = 0; + if ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host)) + retval = dio_set_defer_completion(dio); + else if (!dio->inode->i_sb->s_dio_done_wq) { + /* + * In case of AIO write racing with buffered read we + * need to defer completion. We can't decide this now, + * however the workqueue needs to be initialized here. + */ + retval = sb_init_dio_done_wq(dio->inode->i_sb); + } if (retval) { /* * We grab i_mutex only for reads so we don't have diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f32..8194d30bdca0 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -713,8 +713,24 @@ struct iomap_dio { static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; + struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (!dio->error && + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + if (dio->end_io) { ret = dio->end_io(iocb, dio->error ? dio->error : dio->size, @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ret = iomap_dio_complete(dio); - /* - * Try again to invalidate clean pages which might have been cached by - * non-direct readahead, or faulted in by get_user_pages() if the source - * of the write was an mmap'ed region of the file we're writing. Either - * one is a pretty crazy thing to do, so we don't support it 100%. If - * this invalidation fails, tough, the write still worked... - */ - if (iov_iter_rw(iter) == WRITE) { - int err = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(err); - } - return ret; out_free_dio: diff --git a/mm/filemap.c b/mm/filemap.c index 870971e20967..db250d0e0565 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2926,9 +2926,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... + * + * Most of the time we do not need this since dio_complete() will do + * the invalidation for us. However there are some file systems that + * do not end up with dio_complete() being called, so let's not break + * them by removing it completely */ - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); + if (mapping->nrpages) + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); if (written > 0) { pos += written; -- cgit From f5c156c4c29a3d87176dd6e5c099388e187ec29b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 12:17:16 -0700 Subject: block: fix a crash caused by wrong API part_stat_show takes a part device not a disk, so we should use part_to_disk. Fixes: d62e26b3ffd2("block: pass in queue to inflight accounting") Cc: Bart Van Assche Cc: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/partition-generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 86e8fe1adcdb..88c555db4e5d 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -112,7 +112,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = dev_to_disk(dev)->queue; + struct request_queue *q = part_to_disk(p)->queue; unsigned int inflight[2]; int cpu; -- cgit From 62e082430ea4bb5b28909ca4375bb683931e22aa Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 20 Sep 2017 07:29:49 -0400 Subject: dm ioctl: fix alignment of event number in the device list The size of struct dm_name_list is different on 32-bit and 64-bit kernels (so "(nl + 1)" differs between 32-bit and 64-bit kernels). This mismatch caused some harmless difference in padding when using 32-bit or 64-bit kernel. Commit 23d70c5e52dd ("dm ioctl: report event number in DM_LIST_DEVICES") added reporting event number in the output of DM_LIST_DEVICES_CMD. This difference in padding makes it impossible for userspace to determine the location of the event number (the location would be different when running on 32-bit and 64-bit kernels). Fix the padding by using offsetof(struct dm_name_list, name) instead of sizeof(struct dm_name_list) to determine the location of entries. Also, the ioctl version number is incremented to 37 so that userspace can use the version number to determine that the event number is present and correctly located. In addition, a global event is now raised when a DM device is created, removed, renamed or when table is swapped, so that the user can monitor for device changes. Reported-by: Eugene Syromiatnikov Fixes: 23d70c5e52dd ("dm ioctl: report event number in DM_LIST_DEVICES") Cc: stable@vger.kernel.org # 4.13 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 1 + drivers/md/dm-ioctl.c | 37 ++++++++++++++++++++++++------------- drivers/md/dm.c | 10 ++++++++-- include/uapi/linux/dm-ioctl.h | 4 ++-- 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 24eddbdf2ab4..203144762f36 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -149,5 +149,6 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen extern atomic_t dm_global_event_nr; extern wait_queue_head_t dm_global_eventq; +void dm_issue_global_event(void); #endif diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 8756a6850431..e52676fa9832 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -477,9 +477,13 @@ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_si * Round up the ptr to an 8-byte boundary. */ #define ALIGN_MASK 7 +static inline size_t align_val(size_t val) +{ + return (val + ALIGN_MASK) & ~ALIGN_MASK; +} static inline void *align_ptr(void *ptr) { - return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); + return (void *)align_val((size_t)ptr); } /* @@ -505,7 +509,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ struct hash_cell *hc; size_t len, needed = 0; struct gendisk *disk; - struct dm_name_list *nl, *old_nl = NULL; + struct dm_name_list *orig_nl, *nl, *old_nl = NULL; uint32_t *event_nr; down_write(&_hash_lock); @@ -516,17 +520,15 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ */ for (i = 0; i < NUM_BUCKETS; i++) { list_for_each_entry (hc, _name_buckets + i, name_list) { - needed += sizeof(struct dm_name_list); - needed += strlen(hc->name) + 1; - needed += ALIGN_MASK; - needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK; + needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1); + needed += align_val(sizeof(uint32_t)); } } /* * Grab our output buffer. */ - nl = get_result_buffer(param, param_size, &len); + nl = orig_nl = get_result_buffer(param, param_size, &len); if (len < needed) { param->flags |= DM_BUFFER_FULL_FLAG; goto out; @@ -549,11 +551,16 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ strcpy(nl->name, hc->name); old_nl = nl; - event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1); + event_nr = align_ptr(nl->name + strlen(hc->name) + 1); *event_nr = dm_get_event_nr(hc->md); nl = align_ptr(event_nr + 1); } } + /* + * If mismatch happens, security may be compromised due to buffer + * overflow, so it's better to crash. + */ + BUG_ON((char *)nl - (char *)orig_nl != needed); out: up_write(&_hash_lock); @@ -1621,7 +1628,8 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para * which has a variable size, is not used by the function processing * the ioctl. */ -#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_ISSUE_GLOBAL_EVENT 2 /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char @@ -1635,12 +1643,12 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) ioctl_fn fn; } _ioctls[] = { {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ - {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, + {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, remove_all}, {DM_LIST_DEVICES_CMD, 0, list_devices}, - {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, - {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, - {DM_DEV_RENAME_CMD, 0, dev_rename}, + {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_create}, + {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_remove}, + {DM_DEV_RENAME_CMD, IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_rename}, {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, {DM_DEV_WAIT_CMD, 0, dev_wait}, @@ -1869,6 +1877,9 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); + if (!r && ioctl_flags & IOCTL_FLAGS_ISSUE_GLOBAL_EVENT) + dm_issue_global_event(); + /* * Copy the results back to userland. */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5..4be85324f44d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -52,6 +52,12 @@ static struct workqueue_struct *deferred_remove_workqueue; atomic_t dm_global_event_nr = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); +void dm_issue_global_event(void) +{ + atomic_inc(&dm_global_event_nr); + wake_up(&dm_global_eventq); +} + /* * One of these is allocated per bio. */ @@ -1865,9 +1871,8 @@ static void event_callback(void *context) dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); atomic_inc(&md->event_nr); - atomic_inc(&dm_global_event_nr); wake_up(&md->eventq); - wake_up(&dm_global_eventq); + dm_issue_global_event(); } /* @@ -2283,6 +2288,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) } map = __bind(md, table, &limits); + dm_issue_global_event(); out: mutex_unlock(&md->suspend_lock); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 412c06a624c8..ccaea525340b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -269,9 +269,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 36 +#define DM_VERSION_MINOR 37 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2017-06-09)" +#define DM_VERSION_EXTRA "-ioctl (2017-09-20)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit From 9789e7e93f2b892098d7684ac8131092aa617814 Mon Sep 17 00:00:00 2001 From: Mengting Zhang Date: Sat, 23 Sep 2017 16:18:14 +0800 Subject: perf report: Fix debug messages with --call-graph option With --call-graph option, perf report can display call chains using type, min percent threshold, optional print limit and order. And the default call-graph parameter is 'graph,0.5,caller,function,percent'. Before this patch, 'perf report --call-graph' shows incorrect debug messages as below: # perf report --call-graph Invalid callchain mode: 0.5 Invalid callchain order: 0.5 Invalid callchain sort key: 0.5 Invalid callchain config key: 0.5 Invalid callchain mode: caller Invalid callchain mode: function Invalid callchain order: function Invalid callchain mode: percent Invalid callchain order: percent Invalid callchain sort key: percent That is because in function __parse_callchain_report_opt(),each field of the call-graph parameter is passed to parse_callchain_{mode,order, sort_key,value} in turn until it meets the matching value. For example, the order field "caller" is passed to parse_callchain_mode() firstly and obviously it doesn't match any mode field. Therefore parse_callchain_mode() will shows the debug message "Invalid callchain mode: caller", which could confuse users. The patch fixes this issue by moving the warning out of the function parse_callchain_{mode,order,sort_key,value}. Signed-off-by: Mengting Zhang Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Krister Johansen Cc: Li Bin Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Cc: Yao Jin Link: http://lkml.kernel.org/r/1506154694-39691-1-git-send-email-zhangmengting@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 510b513e0f01..be09d77cade0 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -65,8 +65,6 @@ static int parse_callchain_mode(const char *value) callchain_param.mode = CHAIN_FOLDED; return 0; } - - pr_err("Invalid callchain mode: %s\n", value); return -1; } @@ -82,8 +80,6 @@ static int parse_callchain_order(const char *value) callchain_param.order_set = true; return 0; } - - pr_err("Invalid callchain order: %s\n", value); return -1; } @@ -105,8 +101,6 @@ static int parse_callchain_sort_key(const char *value) callchain_param.branch_callstack = 1; return 0; } - - pr_err("Invalid callchain sort key: %s\n", value); return -1; } @@ -124,8 +118,6 @@ static int parse_callchain_value(const char *value) callchain_param.value = CCVAL_COUNT; return 0; } - - pr_err("Invalid callchain config key: %s\n", value); return -1; } @@ -319,12 +311,27 @@ int perf_callchain_config(const char *var, const char *value) return ret; } - if (!strcmp(var, "print-type")) - return parse_callchain_mode(value); - if (!strcmp(var, "order")) - return parse_callchain_order(value); - if (!strcmp(var, "sort-key")) - return parse_callchain_sort_key(value); + if (!strcmp(var, "print-type")){ + int ret; + ret = parse_callchain_mode(value); + if (ret == -1) + pr_err("Invalid callchain mode: %s\n", value); + return ret; + } + if (!strcmp(var, "order")){ + int ret; + ret = parse_callchain_order(value); + if (ret == -1) + pr_err("Invalid callchain order: %s\n", value); + return ret; + } + if (!strcmp(var, "sort-key")){ + int ret; + ret = parse_callchain_sort_key(value); + if (ret == -1) + pr_err("Invalid callchain sort key: %s\n", value); + return ret; + } if (!strcmp(var, "threshold")) { callchain_param.min_percent = strtod(value, &endptr); if (value == endptr) { -- cgit From 090657c9fb7094e4c1b05c1713d6c2a12ef43dea Mon Sep 17 00:00:00 2001 From: Akemi Yagi Date: Fri, 22 Sep 2017 22:11:53 +0000 Subject: perf tools: Fix syscalltbl build failure The build of kernel v4.14-rc1 for i686 fails on RHEL 6 with the error in tools/perf: util/syscalltbl.c:157: error: expected ';', ',' or ')' before '__maybe_unused' mv: cannot stat `util/.syscalltbl.o.tmp': No such file or directory Fix it by placing/moving: #include outside of #ifdef HAVE_SYSCALL_TABLE block. Signed-off-by: Akemi Yagi Cc: Alan Bartlett Link: http://lkml.kernel.org/r/oq41r8$1v9$1@blaine.gmane.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/syscalltbl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 19e5db90394c..6eea7cff3d4e 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -15,9 +15,9 @@ #include "syscalltbl.h" #include +#include #ifdef HAVE_SYSCALL_TABLE -#include #include #include "string2.h" #include "util.h" -- cgit From 78b1beb0998437107ed144b341fbe1252188916b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 24 Sep 2017 21:46:29 +0300 Subject: IB/core: Fix typo in the name of the tag-matching cap struct The tag matching functionality is implemented by mlx5 driver by extending XRQ, however this internal kernel information was exposed to user space applications with *xrq* name instead of *tm*. This patch renames *xrq* to *tm* to handle that. Fixes: 8d50505ada72 ("IB/uverbs: Expose XRQ capabilities") Signed-off-by: Leon Romanovsky Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 14 +++++++------- drivers/infiniband/hw/mlx5/main.c | 10 +++++----- include/rdma/ib_verbs.h | 4 ++-- include/uapi/rdma/ib_user_verbs.h | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4ab30d832ac5..52a2cf2d83aa 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3869,15 +3869,15 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); - if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps)) goto end; - resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; - resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; - resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; - resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; - resp.xrq_caps.flags = attr.xrq_caps.flags; - resp.response_length += sizeof(resp.xrq_caps); + resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size; + resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags; + resp.tm_caps.max_ops = attr.tm_caps.max_ops; + resp.tm_caps.max_sge = attr.tm_caps.max_sge; + resp.tm_caps.flags = attr.tm_caps.flags; + resp.response_length += sizeof(resp.tm_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 05fb4bdff6a0..d6fbad8f34aa 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -778,13 +778,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(mdev, tag_matching)) { - props->xrq_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; - props->xrq_caps.max_num_tags = + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + props->tm_caps.max_num_tags = (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; - props->xrq_caps.flags = IB_TM_CAP_RC; - props->xrq_caps.max_ops = + props->tm_caps.flags = IB_TM_CAP_RC; + props->tm_caps.max_ops = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); - props->xrq_caps.max_sge = MLX5_TM_MAX_SGE; + props->tm_caps.max_sge = MLX5_TM_MAX_SGE; } if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bdb1279a415b..bbb5f54db882 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -285,7 +285,7 @@ enum ib_tm_cap_flags { IB_TM_CAP_RC = 1 << 0, }; -struct ib_xrq_caps { +struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ @@ -358,7 +358,7 @@ struct ib_device_attr { struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ - struct ib_xrq_caps xrq_caps; + struct ib_tm_caps tm_caps; }; enum ib_mtu { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 9a0b6479fe0c..d4e0b53bfc75 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -261,7 +261,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; - struct ib_uverbs_tm_caps xrq_caps; + struct ib_uverbs_tm_caps tm_caps; }; struct ib_uverbs_query_port { -- cgit From 73827a605bbd7cebef4cfd1261e497246a82a0e7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:30 +0300 Subject: IB/core: Fix qp_sec use after free access When security_ib_alloc_security fails, qp->qp_sec memory is freed. However ib_destroy_qp still tries to access this memory which result in kernel crash. So its initialized to NULL to avoid such access. Fixes: d291f1a65232 ("IB/core: Enforce PKey security on QPs") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/security.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 70ad19c4c73e..88bdafb297f5 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -432,8 +432,10 @@ int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) atomic_set(&qp->qp_sec->error_list_count, 0); init_completion(&qp->qp_sec->error_complete); ret = security_ib_alloc_security(&qp->qp_sec->security); - if (ret) + if (ret) { kfree(qp->qp_sec); + qp->qp_sec = NULL; + } return ret; } -- cgit From edd31551148c09608feee6b8756ad148d550ee3b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:31 +0300 Subject: IB: Correct MR length field to be 64-bit The ib_mr->length represents the length of the MR in bytes as per the IBTA spec 1.3 section 11.2.10.3 (REGISTER PHYSICAL MEMORY REGION). Currently ib_mr->length field is defined as only 32-bits field. This might result into truncation and failed WRs of consumers who registers more than 4GB bytes memory regions and whose WRs accessing such MRs. This patch makes the length 64-bit to avoid such truncation. Cc: Sagi Grimberg Cc: Chuck Lever Cc: Faisal Latif Fixes: 4c67e2bfc8b7 ("IB/core: Introduce new fast registration API") Signed-off-by: Ilya Lesokhin Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_verbs.c | 4 ++-- drivers/infiniband/ulp/iser/iser_memory.c | 2 +- include/rdma/ib_verbs.h | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index f0dc5f4aa177..442b9bdc0f03 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3232,7 +3232,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->ibmr.iova); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, - mr->ibmr.length); + lower_32_bits(mr->ibmr.length)); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); set_wqe_32bit_value(wqe->wqe_words, @@ -3274,7 +3274,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->npages * 8); nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, " - "length: %d, rkey: %0x, pgl_paddr: %llx, " + "length: %lld, rkey: %0x, pgl_paddr: %llx, " "page_list_len: %u, wqe_misc: %x\n", (unsigned long long) mr->ibmr.iova, mr->ibmr.length, diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9c3e9ab53a41..322209d5ff58 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -154,7 +154,7 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) { int i; - iser_err("page vec npages %d data length %d\n", + iser_err("page vec npages %d data length %lld\n", page_vec->npages, page_vec->fake_mr.length); for (i = 0; i < page_vec->npages; i++) iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bbb5f54db882..e8608b2dc844 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1739,7 +1739,7 @@ struct ib_mr { u32 lkey; u32 rkey; u64 iova; - u32 length; + u64 length; unsigned int page_size; bool need_inval; union { diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5a936a6a31a3..df062e086bdb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); -- cgit From 9c6f42e9254150d2772242d9f8bd8d0b7b7431ff Mon Sep 17 00:00:00 2001 From: Shalom Lagziel Date: Sun, 24 Sep 2017 21:46:32 +0300 Subject: IB/ipoib: Fix sysfs Pkey create<->remove possible deadlock A possible ABBA lock can happen with RTNL and vlan_rwsem. For example: Flow A: Device Flush __ipoib_ib_dev_flush down_read(vlan_rwsem) // Lock A ipoib_flush_ah flush_workqueue(priv->wq) // Wait for completion A work on shared WQ (Mcast carrier) ipoib_mcast_carrier_on_task while (!rtnl_trylock()) // Wait for lock B Flow B: Sysfs PKEY delete ipoib_vlan_delete lock(RTNL) // Lock B down_write(vlan_rwsem) // Wait for lock A This can happen with PKEY creates as well. The solution is to release the RTNL lock in sysfs functions in case it is not possible to lock VLAN RW semaphore and reset the SYS call. Fixes: 69956d83267e ("IB/ipoib: Sync between remove_one to sysfs calls that use rtnl_lock") Signed-off-by: Shalom Lagziel Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 9927cd6b7082..e01c58edca15 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -141,14 +141,17 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); - if (!priv) { + if (!down_write_trylock(&ppriv->vlan_rwsem)) { rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - return -ENOMEM; + return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); + if (!priv) { + result = -ENOMEM; + goto out; + } /* * First ensure this isn't a duplicate. We check the parent device and @@ -175,7 +178,7 @@ out: rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - if (result) { + if (result && priv) { free_netdev(priv->dev); kfree(priv); } @@ -204,7 +207,12 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + if (!down_write_trylock(&ppriv->vlan_rwsem)) { + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { -- cgit From 7c9d9662103ae1c11acc7bfc47d988466cff23cf Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Sun, 24 Sep 2017 21:46:33 +0300 Subject: IB/ipoib: Fix inconsistency with free_netdev and free_rdma_netdev Call free_rdma_netdev instead of free_netdev each time we want to release a netdevice. This call is also relevant for future freeing of offloaded child interfaces. This patch also adds a missing call for free netdevice when releasing a parent interface that has child interfaces using ipoib_remove_one. Fixes: cd565b4b51e5 ('IB/IPoIB: Support acceleration options callbacks') Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 15 +++++++++++---- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 10 ++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index bac95b509a9b..dcc77014018d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2180,6 +2180,7 @@ static struct net_device *ipoib_add_port(const char *format, { struct ipoib_dev_priv *priv; struct ib_port_attr attr; + struct rdma_netdev *rn; int result = -ENOMEM; priv = ipoib_intf_alloc(hca, port, format); @@ -2279,7 +2280,8 @@ register_failed: ipoib_dev_cleanup(priv->dev); device_init_failed: - free_netdev(priv->dev); + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); alloc_mem_failed: @@ -2328,7 +2330,7 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { - struct rdma_netdev *rn = netdev_priv(priv->dev); + struct rdma_netdev *parent_rn = netdev_priv(priv->dev); ib_unregister_event_handler(&priv->event_handler); flush_workqueue(ipoib_workqueue); @@ -2350,10 +2352,15 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) unregister_netdev(priv->dev); mutex_unlock(&priv->sysfs_mutex); - rn->free_rdma_netdev(priv->dev); + parent_rn->free_rdma_netdev(priv->dev); + + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + struct rdma_netdev *child_rn; - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) + child_rn = netdev_priv(cpriv->dev); + child_rn->free_rdma_netdev(cpriv->dev); kfree(cpriv); + } kfree(priv); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index e01c58edca15..55a9b71ed05a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -179,7 +179,10 @@ out: mutex_unlock(&ppriv->sysfs_mutex); if (result && priv) { - free_netdev(priv->dev); + struct rdma_netdev *rn; + + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); } @@ -232,7 +235,10 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) mutex_unlock(&ppriv->sysfs_mutex); if (dev) { - free_netdev(dev); + struct rdma_netdev *rn; + + rn = netdev_priv(dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); return 0; } -- cgit From d67bc5d4e3e100d762c0f57ea67f28bc219698a6 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:34 +0300 Subject: IB/mlx5: Simplify mlx5_ib_cont_pages The patch simplifies mlx5_ib_cont_pages and fixes the following issues in the original implementation: First issues is related to alignment of the PFNs. After the check base + p != PFN, the alignment of the PFN wasn't checked. So the PFN sequence 0, 1, 1, 2 would result in a page_shift of 13 even though the 3rd PFN is not 8KB aligned. This wasn't actually a bug because it was supported by all the existing mlx5 compatible device, but we don't want to require this support in all future devices. Another issue is because the inner loop didn't advance PFN so the test "if (base + p != pfn)" always failed for SGE with len > (1< Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mem.c | 47 +++++++++++++++------------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 914f212e7ef6..f3dbd75a0a96 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -50,13 +50,9 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, { unsigned long tmp; unsigned long m; - int i, k; - u64 base = 0; - int p = 0; - int skip; - int mask; - u64 len; - u64 pfn; + u64 base = ~0, p = 0; + u64 len, pfn; + int i = 0; struct scatterlist *sg; int entry; unsigned long page_shift = umem->page_shift; @@ -76,33 +72,24 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, m = find_first_bit(&tmp, BITS_PER_LONG); if (max_page_shift) m = min_t(unsigned long, max_page_shift - page_shift, m); - skip = 1 << m; - mask = skip - 1; - i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = sg_dma_len(sg) >> page_shift; pfn = sg_dma_address(sg) >> page_shift; - for (k = 0; k < len; k++) { - if (!(i & mask)) { - tmp = (unsigned long)pfn; - m = min_t(unsigned long, m, find_first_bit(&tmp, BITS_PER_LONG)); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } else { - if (base + p != pfn) { - tmp = (unsigned long)p; - m = find_first_bit(&tmp, BITS_PER_LONG); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } - } - p++; - i++; + if (base + p != pfn) { + /* If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; } + + p += len; + i += len; } if (i) { -- cgit From fbcd49838d9094ca45772356e7b33afe4b7c93e7 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:35 +0300 Subject: IB/mlx5: Fix NULL deference on mlx5_ib_update_xlt failure mlx5_ib_reg_user_mr called mlx5_ib_dereg_mr in case of MR population failure. This resulted in a NULL dereference as ibmr->device wasn't initialized yet. We address this by adding an internal dereg_mr function that can handle partially initialized MRs, and fixing clean_mr to work on partially initialized MRs. Fixes: ff740aefecb9 ("IB/mlx5: Decouple MR allocation and population flows") Signed-off-by: Ilya Lesokhin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 0e2789d9bb4d..37bbc543847a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -47,7 +47,8 @@ enum { #define MLX5_UMR_ALIGN 2048 -static int clean_mr(struct mlx5_ib_mr *mr); +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); @@ -1270,8 +1271,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift, update_xlt_flags); + if (err) { - mlx5_ib_dereg_mr(&mr->ibmr); + dereg_mr(dev, mr); return ERR_PTR(err); } } @@ -1356,7 +1358,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, &npages, &page_shift, &ncont, &order); if (err < 0) { - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1410,7 +1412,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (err) { mlx5_ib_warn(dev, "Failed to rereg UMR\n"); ib_umem_release(mr->umem); - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1469,9 +1471,8 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) } } -static int clean_mr(struct mlx5_ib_mr *mr) +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int allocated_from_cache = mr->allocated_from_cache; int err; @@ -1507,10 +1508,8 @@ static int clean_mr(struct mlx5_ib_mr *mr) return 0; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); int npages = mr->npages; struct ib_umem *umem = mr->umem; @@ -1539,7 +1538,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) } #endif - clean_mr(mr); + clean_mr(dev, mr); if (umem) { ib_umem_release(umem); @@ -1549,6 +1548,14 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + + return dereg_mr(dev, mr); +} + struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) -- cgit From fe59493240169a2cc3f445ae5f2a2308fda06b63 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:29:15 +0200 Subject: PCI: Add dummy pci_acs_enabled() for CONFIG_PCI=n build If CONFIG_PCI=n and gcc (e.g. 4.1.2) decides not to inline get_pci_function_alias_group(), the build fails with: drivers/iommu/iommu.o: In function `get_pci_function_alias_group': iommu.c:(.text+0xfdc): undefined reference to `pci_acs_enabled' Due to the various dummies for PCI calls in the CONFIG_PCI=n case, pci_acs_enabled() never called, but not all versions of gcc are smart enough to realize that. While explicitly marking get_pci_function_alias_group() inline would fix the build, this would inflate the code for the CONFIG_PCI=y case, as get_pci_function_alias_group() is a not-so-small function called from two places. Hence fix the issue by introducing a dummy for pci_acs_enabled() instead. Fixes: 0ae349a0f33f ("iommu/qcom: Add qcom_iommu") Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci.h b/include/linux/pci.h index f68c58a93dd0..f4f8ee5a7362 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1685,6 +1685,8 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #define dev_is_pci(d) (false) #define dev_is_pf(d) (false) +static inline bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags) +{ return false; } #endif /* CONFIG_PCI */ /* Include architecture-dependent settings and functions */ -- cgit From 9c3340ea7f5dabf88ca096a917cb0ab1f208ef2a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 19:11:07 -0600 Subject: selftests: futex: copy sub-dir test scripts for make O=dir run For make O=dir run_tests to work, test scripts from sub-directories need to be copied over to the object directory. Running tests from the object directory is necessary to avoid making the source tree dirty. Signed-off-by: Shuah Khan Reviewed-by: Darren Hart (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 7c647f619d63..9358cb210fd5 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -11,10 +11,13 @@ all: BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ + if [ -e $$DIR/$(TEST_PROGS) ]; then + rsync -a $$DIR/$(TEST_PROGS) $$BUILD_TARGET/; + fi done override define RUN_TESTS - $(OUTPUT)/run.sh + cd $(OUTPUT); ./run.sh endef override define INSTALL_RULE -- cgit From 8230b905a6780c60372bf5df7bf5f23041ca3196 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 12 Sep 2017 08:52:13 -0600 Subject: selftests: mqueue: Use full path to run tests from Makefile Use full path including $(OUTPUT) to run tests from Makefile for normal case when objects reside in the source tree as well as when objects are relocated with make O=dir. In both cases $(OUTPUT) will be set correctly by lib.mk. Signed-off-by: Shuah Khan --- tools/testing/selftests/mqueue/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile index 79a664aeb8d7..0f5e347b068d 100644 --- a/tools/testing/selftests/mqueue/Makefile +++ b/tools/testing/selftests/mqueue/Makefile @@ -5,8 +5,8 @@ TEST_GEN_PROGS := mq_open_tests mq_perf_tests include ../lib.mk override define RUN_TESTS - @./mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" - @./mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" + $(OUTPUT)/mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" + $(OUTPUT)//mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" endef override define EMIT_TESTS -- cgit From 1ede053632f6380d7e1dba4d781e5eb78621aa3a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 18 Sep 2017 17:30:50 -0600 Subject: selftests: Makefile: fix for loops in targets to run silently Fix for loops in targets to run silently to avoid cluttering the test results. Suppresses the following from targets: e.g run from breakpoints for TARGET in breakpoints; do \ BUILD_TARGET=$BUILD/$TARGET; \ mkdir $BUILD_TARGET -p; \ make OUTPUT=$BUILD_TARGET -C $TARGET;\ done; Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index f4368db011ea..ff805643b5f7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -66,32 +66,32 @@ endif export BUILD all: - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_tests: all - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\ done; hotplug: - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_hotplug: hotplug - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\ done; clean_hotplug: - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; @@ -107,7 +107,7 @@ install: ifdef INSTALL_PATH @# Ask all targets to install their files mkdir -p $(INSTALL_PATH) - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \ done; @@ -132,7 +132,7 @@ else endif clean: - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; -- cgit From 659dbfd8c47adeb03f401d1a1f17091bb63cc5a2 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 18 Sep 2017 18:46:23 -0600 Subject: selftests: futex: Makefile: fix for loops in targets to run silently Fix for loops in targets to run silently to avoid cluttering the test results. Suppresses the following from targets: for DIR in functional; do \ BUILD_TARGET=./tools/testing/selftests/futex/$DIR; \ mkdir $BUILD_TARGET -p; \ make OUTPUT=$BUILD_TARGET -C $DIR all;\ done ./tools/testing/selftests/futex/run.sh Signed-off-by: Shuah Khan Reviewed-by: Darren Hart (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 9358cb210fd5..f0c0369ccb79 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run.sh include ../lib.mk all: - for DIR in $(SUBDIRS); do \ + @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ @@ -17,7 +17,7 @@ all: done override define RUN_TESTS - cd $(OUTPUT); ./run.sh + @cd $(OUTPUT); ./run.sh endef override define INSTALL_RULE @@ -36,7 +36,7 @@ override define EMIT_TESTS endef override define CLEAN - for DIR in $(SUBDIRS); do \ + @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ -- cgit From 10859f3855db4c6f10dc7974ff4b3a292f3de8e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 Sep 2017 16:32:46 -0700 Subject: selftests/seccomp: Support glibc 2.26 siginfo_t.h The 2.26 release of glibc changed how siginfo_t is defined, and the earlier work-around to using the kernel definition are no longer needed. The old way needs to stay around for a while, though. Reported-by: Seth Forshee Cc: Andy Lutomirski Cc: Will Drewry Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Tested-by: Seth Forshee Signed-off-by: Shuah Khan --- tools/testing/selftests/seccomp/seccomp_bpf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 4d6f92a9df6b..19cd272c234d 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -6,10 +6,18 @@ */ #include -#include -#define __have_siginfo_t 1 -#define __have_sigval_t 1 -#define __have_sigevent_t 1 + +/* + * glibc 2.26 and later have SIGSYS in siginfo_t. Before that, + * we need to use the kernel's siginfo.h file and trick glibc + * into accepting it. + */ +#if !__GLIBC_PREREQ(2, 26) +# include +# define __have_siginfo_t 1 +# define __have_sigval_t 1 +# define __have_sigevent_t 1 +#endif #include #include @@ -676,7 +684,7 @@ TEST_F_SIGNAL(TRAP, ign, SIGSYS) syscall(__NR_getpid); } -static struct siginfo TRAP_info; +static siginfo_t TRAP_info; static volatile int TRAP_nr; static void TRAP_action(int nr, siginfo_t *info, void *void_context) { -- cgit From 21aadfa2426d5d199ceb474d0159d079c7f17bfa Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 21 Sep 2017 17:13:27 +0800 Subject: selftests/memfd: correct run_tests.sh permission to fix the following issue: ------------------ TAP version 13 selftests: run_tests.sh ======================================== selftests: Warning: file run_tests.sh is not executable, correct this. not ok 1..1 selftests: run_tests.sh [FAIL] ------------------ Signed-off-by: Li Zhijian Signed-off-by: Shuah Khan --- tools/testing/selftests/memfd/run_tests.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/memfd/run_tests.sh diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh old mode 100644 new mode 100755 -- cgit From 01db7fbf5487505b887fbd6a03c51f2adc952196 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 21 Sep 2017 13:46:01 -0600 Subject: selftests: timers: set-timer-lat: fix hang when std out/err are redirected do_timer_oneshot() uses select() as a timer with FD_SETSIZE and readfs is cleared with FD_ZERO without FD_SET. When stdout and stderr are redirected, the test hangs in select forever. Fix the problem calling select() with readfds empty and nfds zero. This is sufficient for using select() for timer. With this fix "./set-timer-lat > /dev/null 2>&1" no longer hangs. Signed-off-by: Shuah Khan Acked-by: Greg Hackmann Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/set-timer-lat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 9c92b7bd5641..ea1af5dbc7b6 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -228,7 +228,6 @@ int do_timer_oneshot(int clock_id, int flags) timer_t tm1; const int interval = 0; struct timeval timeout; - fd_set fds; int err; err = setup_timer(clock_id, flags, interval, &tm1); @@ -237,9 +236,8 @@ int do_timer_oneshot(int clock_id, int flags) memset(&timeout, 0, sizeof(timeout)); timeout.tv_sec = 5; - FD_ZERO(&fds); do { - err = select(FD_SETSIZE, &fds, NULL, NULL, &timeout); + err = select(0, NULL, NULL, NULL, &timeout); } while (err == -1 && errno == EINTR); timer_delete(tm1); -- cgit From eefd95e1f3d47b90dc768e9ebc77d390c4f34809 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 21 Sep 2017 13:05:18 -0600 Subject: selftests: timers: set-timer-lat: Fix hang when testing unsupported alarms When timer_create() fails on a bootime or realtime clock, setup_timer() returns 0 as if timer has been set. Callers wait forever for the timer to expire. This hang is seen on a system that doesn't have support for: CLOCK_REALTIME_ALARM ABSTIME missing CAP_WAKE_ALARM? : [UNSUPPORTED] Test hangs waiting for a timer that hasn't been set to expire. Fix setup_timer() to return 1, add handling in callers to detect the unsupported case and return 0 without waiting to not fail the test. Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/set-timer-lat.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index ea1af5dbc7b6..50da45437daa 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -143,7 +143,8 @@ int setup_timer(int clock_id, int flags, int interval, timer_t *tm1) printf("%-22s %s missing CAP_WAKE_ALARM? : [UNSUPPORTED]\n", clockstring(clock_id), flags ? "ABSTIME":"RELTIME"); - return 0; + /* Indicate timer isn't set, so caller doesn't wait */ + return 1; } printf("%s - timer_create() failed\n", clockstring(clock_id)); return -1; @@ -213,8 +214,9 @@ int do_timer(int clock_id, int flags) int err; err = setup_timer(clock_id, flags, interval, &tm1); + /* Unsupported case - return 0 to not fail the test */ if (err) - return err; + return err == 1 ? 0 : err; while (alarmcount < 5) sleep(1); @@ -231,8 +233,9 @@ int do_timer_oneshot(int clock_id, int flags) int err; err = setup_timer(clock_id, flags, interval, &tm1); + /* Unsupported case - return 0 to not fail the test */ if (err) - return err; + return err == 1 ? 0 : err; memset(&timeout, 0, sizeof(timeout)); timeout.tv_sec = 5; -- cgit From 10201655b085df8e000822e496e5d4016a167a36 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 07:15:35 -0500 Subject: gfs2: Fix debugfs glocks dump The switch to rhashtables (commit 88ffbf3e03) broke the debugfs glock dump (/sys/kernel/debug/gfs2//glocks) for dumps bigger than a single buffer: the right function for restarting an rhashtable iteration from the beginning of the hash table is rhashtable_walk_enter; rhashtable_walk_stop + rhashtable_walk_start will just resume from the current position. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Cc: stable@vger.kernel.org # v4.3+ --- fs/gfs2/glock.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 98e845b7841b..11066d8647d2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1945,13 +1945,9 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - int ret; - - if (gi->last_pos <= *pos) - n = (*pos - gi->last_pos); - ret = rhashtable_walk_start(&gi->hti); - if (ret) + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + if (rhashtable_walk_start(&gi->hti) != 0) return NULL; do { @@ -1959,6 +1955,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) } while (gi->gl && n--); gi->last_pos = *pos; + return gi->gl; } @@ -1970,6 +1967,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi); + return gi->gl; } @@ -1980,6 +1978,7 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) gi->gl = NULL; rhashtable_walk_stop(&gi->hti); + rhashtable_walk_exit(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -2042,12 +2041,10 @@ static int __gfs2_glocks_open(struct inode *inode, struct file *file, struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } @@ -2063,7 +2060,6 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; - rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } -- cgit From 115ef3b7e61ac64e32827611a127002672ed3725 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 25 Sep 2017 20:21:54 +0200 Subject: watchdog/hardlockup/perf: Cure UP damage for_each_cpu() unintuitively reports CPU0 as set independend of the actual cpumask content on UP kernels. That leads to a NULL pointer dereference when the cleanup function is invoked and there is no event to clean up. Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner --- kernel/watchdog_hld.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index b2931154b5f2..204a8cadb717 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -220,8 +220,13 @@ void hardlockup_detector_perf_cleanup(void) for_each_cpu(cpu, &dead_events_mask) { struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* + * Required because for_each_cpu() reports unconditionally + * CPU0 as set on UP kernels. Sigh. + */ + if (event) + perf_event_release_kernel(event); per_cpu(watchdog_ev, cpu) = NULL; - perf_event_release_kernel(event); } cpumask_clear(&dead_events_mask); } -- cgit From 8cbd96a6285e8eb65232b5afd3e8d9418453a61c Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 21 Sep 2017 08:13:49 -0700 Subject: nvme: fix sqhd reference when admin queue connect fails Fix bug in sqhd patch. It wasn't the sq that was at risk. In the case where the admin queue connect command fails, the sq->size field is not set. Therefore, this becomes a divide by zero error. Add a quick check to bypass under this failure condition. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index c2a768a94235..1b208beeef50 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -390,7 +390,8 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (status) nvmet_set_status(req, status); - req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; + if (req->sq->size) + req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; req->rsp->sq_head = cpu_to_le16(req->sq->sqhd); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; -- cgit From 1a40d97288c6ffea9b355139e88fa62f0e5439f7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:36 +0300 Subject: nvme-core: Use nvme_wq to queue async events and fw activation async_event_work might race as it is executed from two different workqueues at the moment. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5589f67d2cd8..bb2aad078637 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2676,7 +2676,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, case NVME_SC_ABORT_REQ: ++ctrl->event_limit; if (ctrl->state == NVME_CTRL_LIVE) - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); break; default: break; @@ -2691,7 +2691,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, nvme_queue_scan(ctrl); break; case NVME_AER_NOTICE_FW_ACT_STARTING: - schedule_work(&ctrl->fw_act_work); + queue_work(nvme_wq, &ctrl->fw_act_work); break; default: dev_warn(ctrl->device, "async event result %08x\n", result); -- cgit From 0a960afd60d02808c7f7f36d4aa8a2e07045e1e9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:37 +0300 Subject: nvme-rdma: give up reconnect if state change fails If we failed to transition to state LIVE after a successful reconnect, then controller deletion already started. In this case there is no point moving forward with reconnect. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 58983000964b..8441f6b3f617 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -942,7 +942,12 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } + ctrl->ctrl.nr_reconnects = 0; nvme_start_ctrl(&ctrl->ctrl); -- cgit From e4d753d7e51c0648b9ee33efeed55d45f362fc3d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:38 +0300 Subject: nvme-rdma: don't fully stop the controller in error recovery By calling nvme_stop_ctrl on a already failed controller will wait for the scan work to complete (only by identify timeout expiration which is 60 seconds). This is unnecessary when we already know that the controller has failed. Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 8441f6b3f617..92a03ff5fb4d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -967,7 +967,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - nvme_stop_ctrl(&ctrl->ctrl); + nvme_stop_keep_alive(&ctrl->ctrl); if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); -- cgit From 3688feb582a1bc4e58ad50f5eccfdb90615de27b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 15:13:11 -0700 Subject: nvmet-fc: on port remove call put outside lock Avoid calling the put routine, as it may traverse to free routines while holding the target lock. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index c48c83d97e30..6850672ad2a2 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2530,13 +2530,17 @@ nvmet_fc_remove_port(struct nvmet_port *port) { struct nvmet_fc_tgtport *tgtport = port->priv; unsigned long flags; + bool matched = false; spin_lock_irqsave(&nvmet_fc_tgtlock, flags); if (tgtport->port == port) { - nvmet_fc_tgtport_put(tgtport); + matched = true; tgtport->port = NULL; } spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + if (matched) + nvmet_fc_tgtport_put(tgtport); } static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { -- cgit From 0c319d3a144d4b8f1ea2047fd614d2149b68f889 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 16:33:56 -0700 Subject: nvmet-fc: ensure target queue id within range. When searching for queue id's ensure they are within the expected range. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 6850672ad2a2..58e010bdda3e 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -783,6 +783,9 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, u16 qid = nvmet_fc_getqueueid(connection_id); unsigned long flags; + if (qid > NVMET_NR_QUEUES) + return NULL; + spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { -- cgit From 6b71f9e1e849f82abb4a8d54ce7f4b1c71f19ac4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 20 Sep 2017 11:07:26 -0700 Subject: nvmet-fc: sync header templates with comments Comments were incorrect: - defer_rcv was in host port template. moved to target port template - Added Mandatory statements for target port template items Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 9c5cb4480806..a726f96010d5 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -346,11 +346,6 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. * - * @defer_rcv: Called by the transport to signal the LLLD that it has - * begun processing of a previously received NVME CMD IU. The LLDD - * is now free to re-use the rcv buffer associated with the - * nvmefc_tgt_fcp_req. - * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -806,11 +801,19 @@ struct nvmet_fc_target_port { * outstanding operation (if there was one) to complete, then will * call the fcp_req_release() callback to return the command's * exchange context back to the LLDD. + * Entrypoint is Mandatory. * * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req * to the LLDD after all operations on the fcp operation are complete. * This may be due to the command completing or upon completion of * abort cleanup. + * Entrypoint is Mandatory. + * + * @defer_rcv: Called by the transport to signal the LLLD that it has + * begun processing of a previously received NVME CMD IU. The LLDD + * is now free to re-use the rcv buffer associated with the + * nvmefc_tgt_fcp_req. + * Entrypoint is Optional. * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. -- cgit From fddc9923c6d41de9fe7b1f323a3cece53e046c88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 14:01:50 -0700 Subject: nvme-fcloop: fix port deletes and callbacks Now that there are potentially long delays between when a remoteport or targetport delete calls is made and when the callback occurs (dev_loss_tmo timeout), no longer block in the delete routines and move the final nport puts to the callbacks. Moved the fcloop_nport_get/put/free routines to avoid forward declarations. Ensure port_info structs used in registrations are nulled in case fields are not set (ex: devloss_tmo values). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fcloop.c | 102 ++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 64 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1fd1afbb8b2a..7b75d9de55ab 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -224,8 +224,6 @@ struct fcloop_nport { struct fcloop_lport *lport; struct list_head nport_list; struct kref ref; - struct completion rport_unreg_done; - struct completion tport_unreg_done; u64 node_name; u64 port_name; u32 port_role; @@ -630,6 +628,32 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, schedule_work(&inireq->iniwork); } +static void +fcloop_nport_free(struct kref *ref) +{ + struct fcloop_nport *nport = + container_of(ref, struct fcloop_nport, ref); + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&nport->nport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(nport); +} + +static void +fcloop_nport_put(struct fcloop_nport *nport) +{ + kref_put(&nport->ref, fcloop_nport_free); +} + +static int +fcloop_nport_get(struct fcloop_nport *nport) +{ + return kref_get_unless_zero(&nport->ref); +} + static void fcloop_localport_delete(struct nvme_fc_local_port *localport) { @@ -644,8 +668,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; - /* release any threads waiting for the unreg to complete */ - complete(&rport->nport->rport_unreg_done); + fcloop_nport_put(rport->nport); } static void @@ -653,8 +676,7 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) { struct fcloop_tport *tport = targetport->private; - /* release any threads waiting for the unreg to complete */ - complete(&tport->nport->tport_unreg_done); + fcloop_nport_put(tport->nport); } #define FCLOOP_HW_QUEUES 4 @@ -722,6 +744,7 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, goto out_free_opts; } + memset(&pinfo, 0, sizeof(pinfo)); pinfo.node_name = opts->wwnn; pinfo.port_name = opts->wwpn; pinfo.port_role = opts->roles; @@ -804,32 +827,6 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, return ret ? ret : count; } -static void -fcloop_nport_free(struct kref *ref) -{ - struct fcloop_nport *nport = - container_of(ref, struct fcloop_nport, ref); - unsigned long flags; - - spin_lock_irqsave(&fcloop_lock, flags); - list_del(&nport->nport_list); - spin_unlock_irqrestore(&fcloop_lock, flags); - - kfree(nport); -} - -static void -fcloop_nport_put(struct fcloop_nport *nport) -{ - kref_put(&nport->ref, fcloop_nport_free); -} - -static int -fcloop_nport_get(struct fcloop_nport *nport) -{ - return kref_get_unless_zero(&nport->ref); -} - static struct fcloop_nport * fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) { @@ -938,6 +935,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, if (!nport) return -EIO; + memset(&pinfo, 0, sizeof(pinfo)); pinfo.node_name = nport->node_name; pinfo.port_name = nport->port_name; pinfo.port_role = nport->port_role; @@ -979,24 +977,12 @@ __unlink_remote_port(struct fcloop_nport *nport) } static int -__wait_remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) +__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) { - int ret; - if (!rport) return -EALREADY; - init_completion(&nport->rport_unreg_done); - - ret = nvme_fc_unregister_remoteport(rport->remoteport); - if (ret) - return ret; - - wait_for_completion(&nport->rport_unreg_done); - - fcloop_nport_put(nport); - - return ret; + return nvme_fc_unregister_remoteport(rport->remoteport); } static ssize_t @@ -1029,7 +1015,7 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, if (!nport) return -ENOENT; - ret = __wait_remoteport_unreg(nport, rport); + ret = __remoteport_unreg(nport, rport); return ret ? ret : count; } @@ -1086,24 +1072,12 @@ __unlink_target_port(struct fcloop_nport *nport) } static int -__wait_targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) +__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) { - int ret; - if (!tport) return -EALREADY; - init_completion(&nport->tport_unreg_done); - - ret = nvmet_fc_unregister_targetport(tport->targetport); - if (ret) - return ret; - - wait_for_completion(&nport->tport_unreg_done); - - fcloop_nport_put(nport); - - return ret; + return nvmet_fc_unregister_targetport(tport->targetport); } static ssize_t @@ -1136,7 +1110,7 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, if (!nport) return -ENOENT; - ret = __wait_targetport_unreg(nport, tport); + ret = __targetport_unreg(nport, tport); return ret ? ret : count; } @@ -1223,11 +1197,11 @@ static void __exit fcloop_exit(void) spin_unlock_irqrestore(&fcloop_lock, flags); - ret = __wait_targetport_unreg(nport, tport); + ret = __targetport_unreg(nport, tport); if (ret) pr_warn("%s: Failed deleting target port\n", __func__); - ret = __wait_remoteport_unreg(nport, rport); + ret = __remoteport_unreg(nport, rport); if (ret) pr_warn("%s: Failed deleting remote port\n", __func__); -- cgit From a08588ea486a5590b50c36f437dc86350271b250 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Sep 2017 23:24:39 -0700 Subject: irqchip/mips-gic: Fix shifts to extract register fields The MIPS GIC driver is incorrectly using __fls to shift registers, intending to shift to the least significant bit of a value based upon its mask but instead shifting off all but the value's top bit. It should actually be using __ffs to shift to the first, not last, bit of the value. Apparently the system I used when testing commit 3680746abd87 ("irqchip: mips-gic: Convert remaining shared reg access to new accessors") and commit b2b2e584ceab ("irqchip: mips-gic: Clean up mti, reserved-cpu-vectors handling") managed to work correctly despite this issue, but not all systems do... Fixes: 3680746abd87 ("irqchip: mips-gic: Convert remaining shared reg access to new accessors") Fixes: b2b2e584ceab ("irqchip: mips-gic: Clean up mti, reserved-cpu-vectors handling") Signed-off-by: Paul Burton Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/20170922062440.23701-2-paul.burton@imgtec.com --- drivers/irqchip/irq-mips-gic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 40159ac12ac8..0022b31ad2c5 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -645,7 +645,7 @@ static int __init gic_of_init(struct device_node *node, /* Find the first available CPU vector. */ i = 0; - reserved = (C_SW0 | C_SW1) >> __fls(C_SW0); + reserved = (C_SW0 | C_SW1) >> __ffs(C_SW0); while (!of_property_read_u32_index(node, "mti,reserved-cpu-vectors", i++, &cpu_vec)) reserved |= BIT(cpu_vec); @@ -684,11 +684,11 @@ static int __init gic_of_init(struct device_node *node, gicconfig = read_gic_config(); gic_shared_intrs = gicconfig & GIC_CONFIG_NUMINTERRUPTS; - gic_shared_intrs >>= __fls(GIC_CONFIG_NUMINTERRUPTS); + gic_shared_intrs >>= __ffs(GIC_CONFIG_NUMINTERRUPTS); gic_shared_intrs = (gic_shared_intrs + 1) * 8; gic_vpes = gicconfig & GIC_CONFIG_PVPS; - gic_vpes >>= __fls(GIC_CONFIG_PVPS); + gic_vpes >>= __ffs(GIC_CONFIG_PVPS); gic_vpes = gic_vpes + 1; if (cpu_has_veic) { -- cgit From d9f82930a5b41f28fadb1e4838b877ae528456d3 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Sep 2017 23:24:40 -0700 Subject: irqchip/mips-gic: Use effective affinity to unmask Commit 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") adjusted the way we handle masking interrupts to set & clear the interrupt's bit in each pcpu_mask. This allows us to avoid needing to read the GIC mask registers and perform a bitwise and of their values with the pending & pcpu_masks. Unfortunately this didn't quite work for IPIs, which were mapped to a particular CPU/VP during initialisation but never set the affinity or effective_affinity fields of their struct irq_desc. This led to them losing their affinity when gic_unmask_irq() was called for them, and they'd all become affine to cpu0. Fix this by: 1) Setting the effective affinity of interrupts in gic_shared_irq_domain_map(), which is where we actually map an interrupt to a CPU/VP. This ensures that the effective affinity mask is always valid, not just after explicitly setting affinity. 2) Using an interrupt's effective affinity when unmasking it, which prevents gic_unmask_irq() from unintentionally changing which pcpu_mask includes an interrupt. Fixes: 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") Signed-off-by: Paul Burton Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/20170922062440.23701-3-paul.burton@imgtec.com --- drivers/irqchip/irq-mips-gic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 0022b31ad2c5..c90976d7e53c 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -175,14 +175,13 @@ static void gic_mask_irq(struct irq_data *d) static void gic_unmask_irq(struct irq_data *d) { - struct cpumask *affinity = irq_data_get_affinity_mask(d); unsigned int intr = GIC_HWIRQ_TO_SHARED(d->hwirq); unsigned int cpu; write_gic_smask(intr); gic_clear_pcpu_masks(intr); - cpu = cpumask_first_and(affinity, cpu_online_mask); + cpu = cpumask_first(irq_data_get_effective_affinity_mask(d)); set_bit(intr, per_cpu_ptr(pcpu_masks, cpu)); } @@ -420,13 +419,17 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw, unsigned int cpu) { int intr = GIC_HWIRQ_TO_SHARED(hw); + struct irq_data *data; unsigned long flags; + data = irq_get_irq_data(virq); + spin_lock_irqsave(&gic_lock, flags); write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin); write_gic_map_vp(intr, BIT(mips_cm_vp_id(cpu))); gic_clear_pcpu_masks(intr); set_bit(intr, per_cpu_ptr(pcpu_masks, cpu)); + irq_data_update_effective_affinity(data, cpumask_of(cpu)); spin_unlock_irqrestore(&gic_lock, flags); return 0; -- cgit From 7755d83e48397e822aac751b1545f8bcf71d133e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Sep 2017 21:20:41 +0900 Subject: irqdomain: Add __rcu annotations to radix tree accessors Fix various address spaces warning of sparse. kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces) kernel/irq/irqdomain.c:1463:14: expected void **slot kernel/irq/irqdomain.c:1463:14: got void [noderef] ** kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces) kernel/irq/irqdomain.c:1465:66: expected void [noderef] **slot kernel/irq/irqdomain.c:1465:66: got void **slot Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/1506082841-11530-1-git-send-email-yamada.masahiro@socionext.com --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. */ -- cgit From c88f0e6b06f4092995688211a631bb436125d77b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 27 Aug 2017 20:25:26 +0800 Subject: scsi: scsi_transport_iscsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly ChunYu found a kernel crash by syzkaller: [ 651.617875] kasan: CONFIG_KASAN_INLINE enabled [ 651.618217] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 651.618731] general protection fault: 0000 [#1] SMP KASAN [ 651.621543] CPU: 1 PID: 9539 Comm: scsi Not tainted 4.11.0.cov #32 [ 651.621938] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 651.622309] task: ffff880117780000 task.stack: ffff8800a3188000 [ 651.622762] RIP: 0010:skb_release_data+0x26c/0x590 [...] [ 651.627260] Call Trace: [ 651.629156] skb_release_all+0x4f/0x60 [ 651.629450] consume_skb+0x1a5/0x600 [ 651.630705] netlink_unicast+0x505/0x720 [ 651.632345] netlink_sendmsg+0xab2/0xe70 [ 651.633704] sock_sendmsg+0xcf/0x110 [ 651.633942] ___sys_sendmsg+0x833/0x980 [ 651.637117] __sys_sendmsg+0xf3/0x240 [ 651.638820] SyS_sendmsg+0x32/0x50 [ 651.639048] entry_SYSCALL_64_fastpath+0x1f/0xc2 It's caused by skb_shared_info at the end of sk_buff was overwritten by ISCSI_KEVENT_IF_ERROR when parsing nlmsg info from skb in iscsi_if_rx. During the loop if skb->len == nlh->nlmsg_len and both are sizeof(*nlh), ev = nlmsg_data(nlh) will acutally get skb_shinfo(SKB) instead and set a new value to skb_shinfo(SKB)->nr_frags by ev->type. This patch is to fix it by checking nlh->nlmsg_len properly there to avoid over accessing sk_buff. Reported-by: ChunYu Wang Signed-off-by: Xin Long Acked-by: Chris Leech Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 8934f19bce8e..0190aeff5f7f 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -3689,7 +3689,7 @@ iscsi_if_rx(struct sk_buff *skb) uint32_t group; nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || + if (nlh->nlmsg_len < sizeof(*nlh) + sizeof(*ev) || skb->len < nlh->nlmsg_len) { break; } -- cgit From f3a0c7b3fa7cdf6783827c245c62772687f8b3ac Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 9 Sep 2017 20:37:03 +0800 Subject: MAINTAINERS: Add entry for MediaTek PMIC LED driver Add myself as a maintainer to support existing SoCs and push forward following MediaTek PMICs with LEDs to reuse the driver. Signed-off-by: Sean Wang Signed-off-by: Jacek Anaszewski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..22cb6cc91081 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8586,6 +8586,12 @@ M: Sean Wang S: Maintained F: drivers/media/rc/mtk-cir.c +MEDIATEK PMIC LED DRIVER +M: Sean Wang +S: Maintained +F: drivers/leds/leds-mt6323.c +F: Documentation/devicetree/bindings/leds/leds-mt6323.txt + MEDIATEK ETHERNET DRIVER M: Felix Fietkau M: John Crispin -- cgit From fac1c2040203363eab6c6e86ce883cb71390418f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:15 +0200 Subject: smp/hotplug: Add state diagram Add a state diagram to clarify when which states are ran where. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.661598270@infradead.org --- include/linux/cpuhotplug.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f24bfb2b9a2d..477b2e6f60f7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -3,6 +3,24 @@ #include +/* + * CPU-up CPU-down + * + * BP AP BP AP + * + * OFFLINE OFFLINE + * | ^ + * v | + * BRINGUP_CPU->AP_OFFLINE BRINGUP_CPU <- AP_IDLE_DEAD (idle thread/play_dead) + * | AP_OFFLINE + * v (IRQ-off) ,---------------^ + * AP_ONLNE | (stop_machine) + * | TEARDOWN_CPU <- AP_ONLINE_IDLE + * | ^ + * v | + * AP_ACTIVE AP_ACTIVE + */ + enum cpuhp_state { CPUHP_OFFLINE, CPUHP_CREATE_THREADS, -- cgit From 96abb968549cdefd0964d1f7af0a79f4e6e7f897 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:16 +0200 Subject: smp/hotplug: Allow external multi-instance rollback Currently the rollback of multi-instance states is handled inside cpuhp_invoke_callback(). The problem is that when we want to allow an explicit state change for rollback, we need to return from the function without doing the rollback. Change cpuhp_invoke_callback() to optionally return the multi-instance state, such that rollback can be done from a subsequent call. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.720361181@infradead.org --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..323b71050b54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -123,13 +123,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked - * @step: The step in the state machine + * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked + * @node: For multi-instance, do a single entry callback for install/remove + * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup, struct hlist_node *node) + bool bringup, struct hlist_node *node, + struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); @@ -138,6 +141,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int ret, cnt; if (!step->multi_instance) { + WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; if (!cb) return 0; @@ -152,6 +156,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* Single invocation for instance add/remove */ if (node) { + WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); @@ -161,13 +166,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* State transition. Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { + if (lastp && node == *lastp) + break; + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); - if (ret) - goto err; + if (ret) { + if (!lastp) + goto err; + + *lastp = node; + return ret; + } cnt++; } + if (lastp) + *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ @@ -323,7 +338,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -334,7 +349,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, int ret = 0; for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_down(cpu, st); @@ -350,7 +365,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } } @@ -362,7 +377,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -428,11 +443,13 @@ static void cpuhp_thread_fun(unsigned int cpu) if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); local_irq_enable(); } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -472,7 +489,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup, node); + return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->cb_state = state; st->single = true; @@ -595,7 +612,7 @@ static int take_cpu_down(void *_param) st->state--; /* Invoke the former CPU_DYING callbacks */ for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -776,7 +793,7 @@ void notify_cpu_starting(unsigned int cpu) rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -1307,9 +1324,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; -- cgit From 4dddfb5faa6118564b0c54a163353d13882299d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:17 +0200 Subject: smp/hotplug: Rewrite AP state machine core There is currently no explicit state change on rollback. That is, st->bringup, st->rollback and st->target are not consistent when doing the rollback. Rework the AP state handling to be more coherent. This does mean we have to do a second AP kick-and-wait for rollback, but since rollback is the slow path of a slowpath, this really should not matter. Take this opportunity to simplify the AP thread function to only run a single callback per invocation. This unifies the three single/up/down modes is supports. The looping it used to do for up/down are achieved by retaining should_run and relying on the main smpboot_thread_fn() loop. (I have most of a patch that does the same for the BP state handling, but that's not critical and gets a little complicated because CPUHP_BRINGUP_CPU does the AP handoff from a callback, which gets recursive @st usage, I still have de-fugly that.) [ tglx: Move cpuhp_down_callbacks() et al. into the HOTPLUG_CPU section to avoid gcc complaining about unused functions. Make the HOTPLUG_CPU one piece instead of having two consecutive ifdef sections of the same type. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.769658088@infradead.org --- kernel/cpu.c | 321 ++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 115 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 323b71050b54..1139063de5af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,6 +58,7 @@ struct cpuhp_cpu_state { bool single; bool bringup; struct hlist_node *node; + struct hlist_node *last; enum cpuhp_state cb_state; int result; struct completion done; @@ -112,6 +113,14 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -286,7 +295,72 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); +static inline enum cpuhp_state +cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + + st->rollback = false; + st->last = NULL; + + st->target = target; + st->single = false; + st->bringup = st->state < target; + + return prev_state; +} + +static inline void +cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +{ + st->rollback = true; + + /* + * If we have st->last we need to undo partial multi_instance of this + * state first. Otherwise start undo at the previous state. + */ + if (!st->last) { + if (st->bringup) + st->state--; + else + st->state++; + } + + st->target = prev_state; + st->bringup = !st->bringup; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) +{ + if (!st->single && st->state == st->target) + return; + + st->result = 0; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); +} + +static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state; + int ret; + + prev_state = cpuhp_set_state(st, target); + __cpuhp_kick_ap(st); + if ((ret = st->result)) { + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); + } + + return ret; +} static int bringup_wait_for_ap(unsigned int cpu) { @@ -301,12 +375,10 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) { - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - } - return st->result; + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -332,32 +404,6 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); - } -} - -static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - enum cpuhp_state target) -{ - enum cpuhp_state prev_state = st->state; - int ret = 0; - - for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } - return ret; -} static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { @@ -404,71 +450,90 @@ static int cpuhp_should_run(unsigned int cpu) return st->should_run; } -/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ -static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - - return cpuhp_down_callbacks(cpu, st, target); -} - -/* Execute the online startup callbacks. Used to be CPU_ONLINE */ -static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - return cpuhp_up_callbacks(cpu, st, st->target); -} - /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. + * + * Each invocation of this function by the smpboot thread does a single AP + * state callback. + * + * It has 3 modes of operation: + * - single: runs st->cb_state + * - up: runs ++st->state, while st->state < st->target + * - down: runs st->state--, while st->state > st->target + * + * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - int ret = 0; + bool bringup = st->bringup; + enum cpuhp_state state; /* - * Paired with the mb() in cpuhp_kick_ap_work and - * cpuhp_invoke_ap_callback, so the work set is consistent visible. + * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures + * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (!st->should_run) - return; - st->should_run = false; + if (WARN_ON_ONCE(!st->should_run)) + return; lock_map_acquire(&cpuhp_state_lock_map); - /* Single callback invocation for [un]install ? */ + if (st->single) { - if (st->cb_state < CPUHP_AP_ONLINE) { - local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); - local_irq_enable(); + state = st->cb_state; + st->should_run = false; + } else { + if (bringup) { + st->state++; + state = st->state; + st->should_run = (st->state < st->target); + WARN_ON_ONCE(st->state > st->target); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); + state = st->state; + st->state--; + st->should_run = (st->state > st->target); + WARN_ON_ONCE(st->state < st->target); } - } else if (st->rollback) { - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + } + + WARN_ON_ONCE(!cpuhp_is_ap_state(state)); + + if (st->rollback) { + struct cpuhp_step *step = cpuhp_get_step(state); + if (step->skip_onerr) + goto next; + } + + if (cpuhp_is_atomic_state(state)) { + local_irq_disable(); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + local_irq_enable(); - undo_cpu_down(cpu, st); - st->rollback = false; + /* + * STARTING/DYING must not fail! + */ + WARN_ON_ONCE(st->result); } else { - /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - - /* Regular hotplug work */ - if (st->state < st->target) - ret = cpuhp_ap_online(cpu, st); - else if (st->state > st->target) - ret = cpuhp_ap_offline(cpu, st); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + } + + if (st->result) { + /* + * If we fail on a rollback, we're up a creek without no + * paddle, no way forward, no way back. We loose, thanks for + * playing. + */ + WARN_ON_ONCE(st->rollback); + st->should_run = false; } + +next: lock_map_release(&cpuhp_state_lock_map); - st->result = ret; - complete(&st->done); + + if (!st->should_run) + complete(&st->done); } /* Invoke a single callback on a remote cpu */ @@ -477,6 +542,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; if (!cpu_online(cpu)) return 0; @@ -491,48 +557,43 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + st->rollback = false; + st->last = NULL; + + st->node = node; + st->bringup = bringup; st->cb_state = state; st->single = true; - st->bringup = bringup; - st->node = node; - /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() - */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); - wait_for_completion(&st->done); - return st->result; -} + __cpuhp_kick_ap(st); -/* Regular hotplug invocation of the AP hotplug thread */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) -{ - st->result = 0; - st->single = false; /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() + * If we failed and did a partial, do a rollback. */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); + if ((ret = st->result) && st->last) { + st->rollback = true; + st->bringup = !bringup; + + __cpuhp_kick_ap(st); + } + + return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; + enum cpuhp_state prev_state = st->state; + int ret; - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); lock_map_acquire(&cpuhp_state_lock_map); lock_map_release(&cpuhp_state_lock_map); - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - trace_cpuhp_exit(cpu, st->state, state, st->result); - return st->result; + + trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); + ret = cpuhp_kick_ap(st, st->target); + trace_cpuhp_exit(cpu, st->state, prev_state, ret); + + return ret; } static struct smp_hotplug_thread cpuhp_threads = { @@ -693,11 +754,32 @@ void cpuhp_report_idle_dead(void) cpuhp_complete_idle_dead, st, 0); } -#else -#define takedown_cpu NULL -#endif +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_get_step(st->state); -#ifdef CONFIG_HOTPLUG_CPU + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + return ret; +} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -716,13 +798,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; - st->target = target; + prev_state = cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { + st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -737,6 +819,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; + + st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need @@ -744,9 +828,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { - st->target = prev_state; - st->rollback = true; - cpuhp_kick_ap_work(cpu); + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); } out: @@ -771,11 +854,15 @@ out: cpu_maps_update_done(); return err; } + int cpu_down(unsigned int cpu) { return do_cpu_down(cpu, CPUHP_OFFLINE); } EXPORT_SYMBOL(cpu_down); + +#else +#define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** @@ -846,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - st->target = target; + cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. @@ -1313,6 +1400,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; + /* + * If there's nothing to do, we done. + * Relies on the union for multi_instance. + */ if ((bringup && !sp->startup.single) || (!bringup && !sp->teardown.single)) return 0; -- cgit From 724a86881d03ee5794148e65142e24ed3621be66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:18 +0200 Subject: smp/hotplug: Callback vs state-machine consistency While the generic callback functions have an 'int' return and thus appear to be allowed to return error, this is not true for all states. Specifically, what used to be STARTING/DYING are ran with IRQs disabled from critical parts of CPU bringup/teardown and are not allowed to fail. Add WARNs to enforce this rule. But since some callbacks are indeed allowed to fail, we have the situation where a state-machine rollback encounters a failure, in this case we're stuck, we can't go forward and we can't go back. Also add a WARN for that case. AFAICT this is a fundamental 'problem' with no real obvious solution. We want the 'prepare' callbacks to allow failure on either up or down. Typically on prepare-up this would be things like -ENOMEM from resource allocations, and the typical usage in prepare-down would be something like -EBUSY to avoid CPUs being taken away. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.819539119@infradead.org --- kernel/cpu.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 1139063de5af..d6f1b8c36400 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -202,7 +202,14 @@ err: hlist_for_each(node, &step->list) { if (!cnt--) break; - cbm(cpu, node); + + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + /* + * Rollback must not fail, + */ + WARN_ON_ONCE(ret); } return ret; } @@ -659,6 +666,7 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); + int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -672,8 +680,13 @@ static int take_cpu_down(void *_param) WARN_ON(st->state != CPUHP_TEARDOWN_CPU); st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + /* + * DYING must not fail! + */ + WARN_ON_ONCE(ret); + } /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -876,11 +889,16 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + /* + * STARTING must not fail! + */ + WARN_ON_ONCE(ret); } } -- cgit From 5f4b55e10645b7371322c800a5ec745cab487a6c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:20 +0200 Subject: smp/hotplug: Differentiate the AP-work lockdep class between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. CPU0 CPU1 CPU2 cpuhp_up_callbacks: takedown_cpu: cpuhp_thread_fun: cpuhp_state irq_lock_sparse() irq_lock_sparse() wait_for_completion() cpuhp_state complete() Now that we have consistent AP state, we can trivially separate the AP-work class between up and down using st->bringup. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.922524234@infradead.org --- kernel/cpu.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index d6f1b8c36400..d5c09985fbb6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -68,9 +68,26 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) -static struct lock_class_key cpuhp_state_key; -static struct lockdep_map cpuhp_state_lock_map = - STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +static struct lockdep_map cpuhp_state_up_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); +static struct lockdep_map cpuhp_state_down_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); + + +static void inline cpuhp_lock_acquire(bool bringup) +{ + lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} + +static void inline cpuhp_lock_release(bool bringup) +{ + lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} +#else + +static void inline cpuhp_lock_acquire(bool bringup) { } +static void inline cpuhp_lock_release(bool bringup) { } + #endif /** @@ -486,7 +503,7 @@ static void cpuhp_thread_fun(unsigned int cpu) if (WARN_ON_ONCE(!st->should_run)) return; - lock_map_acquire(&cpuhp_state_lock_map); + cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; @@ -537,7 +554,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } next: - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_release(bringup); if (!st->should_run) complete(&st->done); @@ -554,8 +571,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls @@ -593,8 +613,11 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state prev_state = st->state; int ret; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(st, st->target); -- cgit From 5ebe7742fff8be5f1359bc50f5d43fb6ff7bd060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:19 +0200 Subject: smp/hotplug: Differentiate the AP completion between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. takedown_cpu() irq_lock_sparse() wait_for_completion(&st->done) cpuhp_thread_fun cpuhp_up_callback cpuhp_invoke_callback irq_affinity_online_cpu irq_local_spare() irq_unlock_sparse() complete(&st->done) Now that we have consistent AP state, we can trivially separate the AP completion between up and down using st->bringup. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.872472799@infradead.org --- kernel/cpu.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index d5c09985fbb6..6bbe261b851f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -46,7 +46,8 @@ * @bringup: Single callback bringup or teardown selector * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation - * @done: Signal completion to the issuer of the task + * @done_up: Signal completion to the issuer of the task for cpu-up + * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; @@ -61,7 +62,8 @@ struct cpuhp_cpu_state { struct hlist_node *last; enum cpuhp_state cb_state; int result; - struct completion done; + struct completion done_up; + struct completion done_down; #endif }; @@ -130,14 +132,6 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } -/* - * The former STARTING/DYING states, ran with IRQs disabled and must not fail. - */ -static bool cpuhp_is_atomic_state(enum cpuhp_state state) -{ - return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; -} - static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -232,6 +226,26 @@ err: } #ifdef CONFIG_SMP +static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + wait_for_completion(done); +} + +static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + complete(done); +} + +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -368,7 +382,7 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) smp_mb(); st->should_run = true; wake_up_process(st->thread); - wait_for_completion(&st->done); + wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) @@ -391,7 +405,7 @@ static int bringup_wait_for_ap(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; @@ -464,7 +478,8 @@ static void cpuhp_create(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - init_completion(&st->done); + init_completion(&st->done_up); + init_completion(&st->done_down); } static int cpuhp_should_run(unsigned int cpu) @@ -557,7 +572,7 @@ next: cpuhp_lock_release(bringup); if (!st->should_run) - complete(&st->done); + complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ @@ -753,7 +768,7 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ @@ -772,7 +787,7 @@ static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; - complete(&st->done); + complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) @@ -939,7 +954,7 @@ void cpuhp_online_idle(enum cpuhp_state state) return; st->state = CPUHP_AP_ONLINE_IDLE; - complete(&st->done); + complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ -- cgit From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:21 +0200 Subject: smp/hotplug: Hotplug state fail injection Add a sysfs file to one-time fail a specific state. This can be used to test the state rollback code paths. Something like this (hotplug-up.sh): #!/bin/bash echo 0 > /debug/sched_debug echo 1 > /debug/tracing/events/cpuhp/enable ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1` STATES=${1:-$ALL_STATES} for state in $STATES do echo 0 > /sys/devices/system/cpu/cpu1/online echo 0 > /debug/tracing/trace echo Fail state: $state echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail cat /sys/devices/system/cpu/cpu1/hotplug/fail echo 1 > /sys/devices/system/cpu/cpu1/online cat /debug/tracing/trace > hotfail-${state}.trace sleep 1 done Can be used to test for all possible rollback (barring multi-instance) scenarios on CPU-up, CPU-down is a trivial modification of the above. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org --- include/linux/cpuhotplug.h | 3 ++- kernel/cpu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 477b2e6f60f7..6d508767e144 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,7 +22,8 @@ */ enum cpuhp_state { - CPUHP_OFFLINE, + CPUHP_INVALID = -1, + CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, diff --git a/kernel/cpu.c b/kernel/cpu.c index 6bbe261b851f..8de11a29e495 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; + enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; @@ -67,7 +68,9 @@ struct cpuhp_cpu_state { #endif }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { + .fail = CPUHP_INVALID, +}; #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = @@ -160,6 +163,15 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (st->fail == state) { + st->fail = CPUHP_INVALID; + + if (!(bringup ? step->startup.single : step->teardown.single)) + return 0; + + return -EAGAIN; + } + if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; @@ -1805,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev, } static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int fail, ret; + + ret = kstrtoint(buf, 10, &fail); + if (ret) + return ret; + + /* + * Cannot fail STARTING/DYING callbacks. + */ + if (cpuhp_is_atomic_state(fail)) + return -EINVAL; + + /* + * Cannot fail anything that doesn't have callbacks. + */ + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(fail); + if (!sp->startup.single && !sp->teardown.single) + ret = -EINVAL; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + st->fail = fail; + + return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); + static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, + &dev_attr_fail.attr, NULL }; -- cgit From cdd10c9627496ad25c87ce6394e29752253c69d3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:23 +0200 Subject: l2tp: ensure sessions are freed after their PPPOL2TP socket If l2tp_tunnel_delete() or l2tp_tunnel_closeall() deletes a session right after pppol2tp_release() orphaned its socket, then the 'sock' variable of the pppol2tp_session_close() callback is NULL. Yet the session is still used by pppol2tp_release(). Therefore we need to take an extra reference in any case, to prevent l2tp_tunnel_delete() or l2tp_tunnel_closeall() from freeing the session. Since the pppol2tp_session_close() callback is only set if the session is associated to a PPPOL2TP socket and that both l2tp_tunnel_delete() and l2tp_tunnel_closeall() hold the PPPOL2TP socket before calling pppol2tp_session_close(), we're sure that pppol2tp_session_close() and pppol2tp_session_destruct() are paired and called in the right order. So the reference taken by the former will be released by the later. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 50e3ee9a9d61..bc6e8bfc5be4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -437,11 +437,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { + if (sock) inet_shutdown(sock, SEND_SHUTDOWN); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); - } + + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if -- cgit From b228a94066406b6c456321d69643b0d7ce11cfa6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:24 +0200 Subject: l2tp: fix race between l2tp_session_delete() and l2tp_tunnel_closeall() There are several ways to remove L2TP sessions: * deleting a session explicitly using the netlink interface (with L2TP_CMD_SESSION_DELETE), * deleting the session's parent tunnel (either by closing the tunnel's file descriptor or using the netlink interface), * closing the PPPOL2TP file descriptor of a PPP pseudo-wire. In some cases, when these methods are used concurrently on the same session, the session can be removed twice, leading to use-after-free bugs. This patch adds a 'dead' flag, used by l2tp_session_delete() and l2tp_tunnel_closeall() to prevent them from stepping on each other's toes. The session deletion path used when closing a PPPOL2TP file descriptor doesn't need to be adapted. It already has to ensure that a session remains valid for the lifetime of its PPPOL2TP file descriptor. So it takes an extra reference on the session in the ->session_close() callback (pppol2tp_session_close()), which is eventually dropped in the ->sk_destruct() callback of the PPPOL2TP socket (pppol2tp_session_destruct()). Still, __l2tp_session_unhash() and l2tp_session_queue_purge() can be called twice and even concurrently for a given session, but thanks to proper locking and re-initialisation of list fields, this is not an issue. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 6 ++++++ net/l2tp/l2tp_core.h | 1 + 2 files changed, 7 insertions(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee485df73ccd..d8c2a89a76e1 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1314,6 +1314,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1750,6 +1753,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a305e0c5925a..70a12df40a5f 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -76,6 +76,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ -- cgit From 910801809b2e40a4baedd080ef5d80b4a180e70e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 20 Sep 2017 16:58:38 +0200 Subject: security/keys: properly zero out sensitive key material in big_key Error paths forgot to zero out sensitive material, so this patch changes some kfrees into a kzfrees. Signed-off-by: Jason A. Donenfeld Signed-off-by: David Howells Reviewed-by: Eric Biggers Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Cc: stable@vger.kernel.org --- security/keys/big_key.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 6acb00f6f22c..507d6fb86a4f 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -195,7 +195,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) *path = file->f_path; path_get(path); fput(file); - kfree(data); + kzfree(data); } else { /* Just store the data in a buffer */ void *data = kmalloc(datalen, GFP_KERNEL); @@ -211,9 +211,9 @@ int big_key_preparse(struct key_preparsed_payload *prep) err_fput: fput(file); err_enckey: - kfree(enckey); + kzfree(enckey); error: - kfree(data); + kzfree(data); return ret; } @@ -227,7 +227,7 @@ void big_key_free_preparse(struct key_preparsed_payload *prep) path_put(path); } - kfree(prep->payload.data[big_key_data]); + kzfree(prep->payload.data[big_key_data]); } /* @@ -259,7 +259,7 @@ void big_key_destroy(struct key *key) path->mnt = NULL; path->dentry = NULL; } - kfree(key->payload.data[big_key_data]); + kzfree(key->payload.data[big_key_data]); key->payload.data[big_key_data] = NULL; } @@ -328,7 +328,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) err_fput: fput(file); error: - kfree(data); + kzfree(data); } else { ret = datalen; if (copy_to_user(buffer, key->payload.data[big_key_data], -- cgit From 428490e38b2e352812e0b765d8bceafab0ec441d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 20 Sep 2017 16:58:39 +0200 Subject: security/keys: rewrite all of big_key crypto This started out as just replacing the use of crypto/rng with get_random_bytes_wait, so that we wouldn't use bad randomness at boot time. But, upon looking further, it appears that there were even deeper underlying cryptographic problems, and that this seems to have been committed with very little crypto review. So, I rewrote the whole thing, trying to keep to the conventions introduced by the previous author, to fix these cryptographic flaws. It makes no sense to seed crypto/rng at boot time and then keep using it like this, when in fact there's already get_random_bytes_wait, which can ensure there's enough entropy and be a much more standard way of generating keys. Since this sensitive material is being stored untrusted, using ECB and no authentication is simply not okay at all. I find it surprising and a bit horrifying that this code even made it past basic crypto review, which perhaps points to some larger issues. This patch moves from using AES-ECB to using AES-GCM. Since keys are uniquely generated each time, we can set the nonce to zero. There was also a race condition in which the same key would be reused at the same time in different threads. A mutex fixes this issue now. So, to summarize, this commit fixes the following vulnerabilities: * Low entropy key generation, allowing an attacker to potentially guess or predict keys. * Unauthenticated encryption, allowing an attacker to modify the cipher text in particular ways in order to manipulate the plaintext, which is is even more frightening considering the next point. * Use of ECB mode, allowing an attacker to trivially swap blocks or compare identical plaintext blocks. * Key re-use. * Faulty memory zeroing. Signed-off-by: Jason A. Donenfeld Reviewed-by: Eric Biggers Signed-off-by: David Howells Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Cc: stable@vger.kernel.org --- security/keys/Kconfig | 4 +- security/keys/big_key.c | 127 ++++++++++++++++++++++-------------------------- 2 files changed, 60 insertions(+), 71 deletions(-) diff --git a/security/keys/Kconfig b/security/keys/Kconfig index a7a23b5541f8..91eafada3164 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -45,10 +45,8 @@ config BIG_KEYS bool "Large payload keys" depends on KEYS depends on TMPFS - depends on (CRYPTO_ANSI_CPRNG = y || CRYPTO_DRBG = y) select CRYPTO_AES - select CRYPTO_ECB - select CRYPTO_RNG + select CRYPTO_GCM help This option provides support for holding large keys within the kernel (for example Kerberos ticket caches). The data may be stored out to diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 507d6fb86a4f..e607830b6154 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -1,5 +1,6 @@ /* Large capacity key type * + * Copyright (C) 2017 Jason A. Donenfeld . All Rights Reserved. * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * @@ -16,10 +17,10 @@ #include #include #include +#include #include #include -#include -#include +#include /* * Layout of key payload words. @@ -49,7 +50,12 @@ enum big_key_op { /* * Key size for big_key data encryption */ -#define ENC_KEY_SIZE 16 +#define ENC_KEY_SIZE 32 + +/* + * Authentication tag length + */ +#define ENC_AUTHTAG_SIZE 16 /* * big_key defined keys take an arbitrary string as the description and an @@ -64,57 +70,62 @@ struct key_type key_type_big_key = { .destroy = big_key_destroy, .describe = big_key_describe, .read = big_key_read, + /* no ->update(); don't add it without changing big_key_crypt() nonce */ }; /* - * Crypto names for big_key data encryption + * Crypto names for big_key data authenticated encryption */ -static const char big_key_rng_name[] = "stdrng"; -static const char big_key_alg_name[] = "ecb(aes)"; +static const char big_key_alg_name[] = "gcm(aes)"; /* - * Crypto algorithms for big_key data encryption + * Crypto algorithms for big_key data authenticated encryption */ -static struct crypto_rng *big_key_rng; -static struct crypto_skcipher *big_key_skcipher; +static struct crypto_aead *big_key_aead; /* - * Generate random key to encrypt big_key data + * Since changing the key affects the entire object, we need a mutex. */ -static inline int big_key_gen_enckey(u8 *key) -{ - return crypto_rng_get_bytes(big_key_rng, key, ENC_KEY_SIZE); -} +static DEFINE_MUTEX(big_key_aead_lock); /* * Encrypt/decrypt big_key data */ static int big_key_crypt(enum big_key_op op, u8 *data, size_t datalen, u8 *key) { - int ret = -EINVAL; + int ret; struct scatterlist sgio; - SKCIPHER_REQUEST_ON_STACK(req, big_key_skcipher); - - if (crypto_skcipher_setkey(big_key_skcipher, key, ENC_KEY_SIZE)) { + struct aead_request *aead_req; + /* We always use a zero nonce. The reason we can get away with this is + * because we're using a different randomly generated key for every + * different encryption. Notably, too, key_type_big_key doesn't define + * an .update function, so there's no chance we'll wind up reusing the + * key to encrypt updated data. Simply put: one key, one encryption. + */ + u8 zero_nonce[crypto_aead_ivsize(big_key_aead)]; + + aead_req = aead_request_alloc(big_key_aead, GFP_KERNEL); + if (!aead_req) + return -ENOMEM; + + memset(zero_nonce, 0, sizeof(zero_nonce)); + sg_init_one(&sgio, data, datalen + (op == BIG_KEY_ENC ? ENC_AUTHTAG_SIZE : 0)); + aead_request_set_crypt(aead_req, &sgio, &sgio, datalen, zero_nonce); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + aead_request_set_ad(aead_req, 0); + + mutex_lock(&big_key_aead_lock); + if (crypto_aead_setkey(big_key_aead, key, ENC_KEY_SIZE)) { ret = -EAGAIN; goto error; } - - skcipher_request_set_tfm(req, big_key_skcipher); - skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, - NULL, NULL); - - sg_init_one(&sgio, data, datalen); - skcipher_request_set_crypt(req, &sgio, &sgio, datalen, NULL); - if (op == BIG_KEY_ENC) - ret = crypto_skcipher_encrypt(req); + ret = crypto_aead_encrypt(aead_req); else - ret = crypto_skcipher_decrypt(req); - - skcipher_request_zero(req); - + ret = crypto_aead_decrypt(aead_req); error: + mutex_unlock(&big_key_aead_lock); + aead_request_free(aead_req); return ret; } @@ -146,16 +157,13 @@ int big_key_preparse(struct key_preparsed_payload *prep) * * File content is stored encrypted with randomly generated key. */ - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; loff_t pos = 0; - /* prepare aligned data to encrypt */ data = kmalloc(enclen, GFP_KERNEL); if (!data) return -ENOMEM; - memcpy(data, prep->data, datalen); - memset(data + datalen, 0x00, enclen - datalen); /* generate random key */ enckey = kmalloc(ENC_KEY_SIZE, GFP_KERNEL); @@ -163,13 +171,12 @@ int big_key_preparse(struct key_preparsed_payload *prep) ret = -ENOMEM; goto error; } - - ret = big_key_gen_enckey(enckey); - if (ret) + ret = get_random_bytes_wait(enckey, ENC_KEY_SIZE); + if (unlikely(ret)) goto err_enckey; /* encrypt aligned data */ - ret = big_key_crypt(BIG_KEY_ENC, data, enclen, enckey); + ret = big_key_crypt(BIG_KEY_ENC, data, datalen, enckey); if (ret) goto err_enckey; @@ -295,7 +302,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) struct file *file; u8 *data; u8 *enckey = (u8 *)key->payload.data[big_key_data]; - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; loff_t pos = 0; data = kmalloc(enclen, GFP_KERNEL); @@ -344,47 +351,31 @@ error: */ static int __init big_key_init(void) { - struct crypto_skcipher *cipher; - struct crypto_rng *rng; int ret; - rng = crypto_alloc_rng(big_key_rng_name, 0, 0); - if (IS_ERR(rng)) { - pr_err("Can't alloc rng: %ld\n", PTR_ERR(rng)); - return PTR_ERR(rng); - } - - big_key_rng = rng; - - /* seed RNG */ - ret = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); - if (ret) { - pr_err("Can't reset rng: %d\n", ret); - goto error_rng; - } - /* init block cipher */ - cipher = crypto_alloc_skcipher(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(cipher)) { - ret = PTR_ERR(cipher); + big_key_aead = crypto_alloc_aead(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(big_key_aead)) { + ret = PTR_ERR(big_key_aead); pr_err("Can't alloc crypto: %d\n", ret); - goto error_rng; + return ret; + } + ret = crypto_aead_setauthsize(big_key_aead, ENC_AUTHTAG_SIZE); + if (ret < 0) { + pr_err("Can't set crypto auth tag len: %d\n", ret); + goto free_aead; } - - big_key_skcipher = cipher; ret = register_key_type(&key_type_big_key); if (ret < 0) { pr_err("Can't register type: %d\n", ret); - goto error_cipher; + goto free_aead; } return 0; -error_cipher: - crypto_free_skcipher(big_key_skcipher); -error_rng: - crypto_free_rng(big_key_rng); +free_aead: + crypto_free_aead(big_key_aead); return ret; } -- cgit From e4d8ae00169f7686e1da5a62e5cf797d12bf8822 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Sep 2017 10:44:36 -0700 Subject: PM / OPP: Call notifier without holding opp_table->lock The notifier callbacks may want to call some OPP helper routines which may try to take the same opp_table->lock again and cause a deadlock. One such usecase was reported by Chanwoo Choi, where calling dev_pm_opp_disable() leads us to the devfreq's OPP notifier handler, which further calls dev_pm_opp_find_freq_floor() and it deadlocks. We don't really need the opp_table->lock to be held across the notifier call though, all we want to make sure is that the 'opp' doesn't get freed while being used from within the notifier chain. We can do it with help of dev_pm_opp_get/put() as well. Let's do it. Cc: 4.11+ # 4.11+ Fixes: 5b650b388844 "PM / OPP: Take kref from _find_opp_table()" Reported-by: Chanwoo Choi Tested-by: Chanwoo Choi Reviewed-by: Stephen Boyd Reviewed-by: Chanwoo Choi Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index a8cc14fd8ae4..a6de32530693 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -1581,6 +1581,9 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, opp->available = availability_req; + dev_pm_opp_get(opp); + mutex_unlock(&opp_table->lock); + /* Notify the change of the OPP availability */ if (availability_req) blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE, @@ -1589,8 +1592,12 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_DISABLE, opp); + dev_pm_opp_put(opp); + goto put_table; + unlock: mutex_unlock(&opp_table->lock); +put_table: dev_pm_opp_put_opp_table(opp_table); return r; } -- cgit From 675195d0be27391d48d8d23c7c62991505168528 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 20 Sep 2017 08:58:53 +0200 Subject: scsi: scsi_transport_fc: set scsi_target_id upon rescan When an rport is found in the bindings array there is no guarantee that it had been a target port, so we need to call fc_remote_port_rolechg() here to ensure the scsi_target_id is set correctly. Otherwise the port will never be scanned. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Chad Dupuis Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_fc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index ba9d70f8a6a1..e74fffc32c75 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2876,7 +2876,6 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, memcpy(&rport->port_name, &ids->port_name, sizeof(rport->port_name)); rport->port_id = ids->port_id; - rport->roles = ids->roles; rport->port_state = FC_PORTSTATE_ONLINE; rport->flags &= ~FC_RPORT_FAST_FAIL_TIMEDOUT; @@ -2885,15 +2884,7 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, fci->f->dd_fcrport_size); spin_unlock_irqrestore(shost->host_lock, flags); - if (ids->roles & FC_PORT_ROLE_FCP_TARGET) { - scsi_target_unblock(&rport->dev, SDEV_RUNNING); - - /* initiate a scan of the target */ - spin_lock_irqsave(shost->host_lock, flags); - rport->flags |= FC_RPORT_SCAN_PENDING; - scsi_queue_work(shost, &rport->scan_work); - spin_unlock_irqrestore(shost->host_lock, flags); - } + fc_remote_port_rolechg(rport, ids->roles); return rport; } } -- cgit From d477bf3af1e88fe27c893f84136647fe11963198 Mon Sep 17 00:00:00 2001 From: Suniel Mahesh Date: Thu, 21 Sep 2017 19:09:03 +0530 Subject: cpufreq: dt: Fix sysfs duplicate filename creation for platform-device ti-cpufreq and cpufreq-dt-platdev drivers are registering platform-device with same name "cpufreq-dt" using platform_device_register_*() routines. This is leading to build warnings appended below. Providing hardware information to OPP framework along with the platform- device creation should be done by ti-cpufreq driver before cpufreq-dt driver comes into place. This patch add's TI am33xx, am43 and dra7 platforms (which use opp-v2 property) to the blacklist of devices in cpufreq-dt-platform driver to avoid creating platform-device twice and remove build warnings. [ 2.370167] ------------[ cut here ]------------ [ 2.375087] WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x58/0x78 [ 2.383112] sysfs: cannot create duplicate filename '/devices/platform/cpufreq-dt' [ 2.391219] Modules linked in: [ 2.394506] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-next-20170912 #1 [ 2.402006] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.408437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.416568] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.424165] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.431488] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.439351] [] (warn_slowpath_fmt) from [] (sysfs_warn_dup+0x58/0x78) [ 2.447938] [] (sysfs_warn_dup) from [] (sysfs_create_dir_ns+0x80/0x98) [ 2.456719] [] (sysfs_create_dir_ns) from [] (kobject_add_internal+0x9c/0x2d4) [ 2.466124] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.474712] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.482489] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.491085] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.501305] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.511253] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.519838] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.528974] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.537565] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.545981] ---[ end trace 2fc00e213c13ab20 ]--- [ 2.551051] ------------[ cut here ]------------ [ 2.555931] WARNING: CPU: 0 PID: 1 at lib/kobject.c:240 kobject_add_internal+0x254/0x2d4 [ 2.564578] kobject_add_internal failed for cpufreq-dt with -EEXIST, don't try to register things with the same name in the same directory. [ 2.577977] Modules linked in: [ 2.581261] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.13.0-next-20170912 #1 [ 2.590013] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.596437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.604573] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.612172] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.619494] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.627362] [] (warn_slowpath_fmt) from [] (kobject_add_internal+0x254/0x2d4) [ 2.636666] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.645255] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.653027] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.661615] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.671833] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.681779] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.690377] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.699510] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.708106] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.716217] ---[ end trace 2fc00e213c13ab21 ]--- Fixes: edeec420de24 (cpufreq: dt-cpufreq: platdev Automatically create device with OPP v2) Signed-off-by: Suniel Mahesh Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 430edadca527..a753c50e9e41 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -118,6 +118,10 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "sigma,tango4", }, + { .compatible = "ti,am33xx", }, + { .compatible = "ti,am43", }, + { .compatible = "ti,dra7", }, + { } }; -- cgit From a93ad944f4ff9a797abff17c73fc4b1e4a1d9141 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Fri, 22 Sep 2017 15:32:44 -0500 Subject: net: qcom/emac: specify the correct size when mapping a DMA buffer When mapping the RX DMA buffers, the driver was accidentally specifying zero for the buffer length. Under normal circumstances, SWIOTLB does not need to allocate a bounce buffer, so the address is just mapped without checking the size field. This is why the error was not detected earlier. Fixes: b9b17debc69d ("net: emac: emac gigabit ethernet controller driver") Cc: stable@vger.kernel.org Signed-off-by: Timur Tabi Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac-mac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index 0ea3ca09c689..3ed9033e56db 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -898,7 +898,8 @@ static void emac_mac_rx_descs_refill(struct emac_adapter *adpt, curr_rxbuf->dma_addr = dma_map_single(adpt->netdev->dev.parent, skb->data, - curr_rxbuf->length, DMA_FROM_DEVICE); + adpt->rxbuf_size, DMA_FROM_DEVICE); + ret = dma_mapping_error(adpt->netdev->dev.parent, curr_rxbuf->dma_addr); if (ret) { -- cgit From 9561475db680f7144d2223a409dd3d7e322aca03 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:40 +0200 Subject: PCI: Fix race condition with driver_override The driver_override implementation is susceptible to a race condition when different threads are reading vs. storing a different driver override. Add locking to avoid the race condition. This is in close analogy to commit 6265539776a0 ("driver core: platform: fix race condition with driver_override") from Adrian Salido. Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override") Signed-off-by: Nicolai Stange Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v3.16+ --- drivers/pci/pci-sysfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 1eecfa301f7f..8e075ea2743e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -686,7 +686,7 @@ static ssize_t driver_override_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - char *driver_override, *old = pdev->driver_override, *cp; + char *driver_override, *old, *cp; /* We need to keep extra room for a newline */ if (count >= (PAGE_SIZE - 1)) @@ -700,12 +700,15 @@ static ssize_t driver_override_store(struct device *dev, if (cp) *cp = '\0'; + device_lock(dev); + old = pdev->driver_override; if (strlen(driver_override)) { pdev->driver_override = driver_override; } else { kfree(driver_override); pdev->driver_override = NULL; } + device_unlock(dev); kfree(old); @@ -716,8 +719,12 @@ static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); + ssize_t len; - return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_lock(dev); + len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_unlock(dev); + return len; } static DEVICE_ATTR_RW(driver_override); -- cgit From b776e4b1a990045a7c70798f1f353c3160c26594 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Sep 2017 20:38:45 -0400 Subject: fix a typo in put_compat_shm_info() "uip" misspelled as "up"; unfortunately, the latter happens to be a function and gcc is happy to convert it to void *... Signed-off-by: Al Viro --- ipc/shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 1e2b1692ba2c..badac463e2c8 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1154,7 +1154,7 @@ static int put_compat_shm_info(struct shm_info *ip, info.shm_swp = ip->shm_swp; info.swap_attempts = ip->swap_attempts; info.swap_successes = ip->swap_successes; - return copy_to_user(up, &info, sizeof(info)); + return copy_to_user(uip, &info, sizeof(info)); } static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in, -- cgit From cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:16 -0700 Subject: xfs: don't unconditionally clear the reflink flag on zero-block files If we have speculative cow preallocations hanging around in the cow fork, don't let a truncate operation clear the reflink flag because if we do then there's a chance we'll forget to free those extents when we destroy the incore inode. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5599dda4727a..4ec5b7f45401 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1624,10 +1624,12 @@ xfs_itruncate_extents( goto out; /* - * Clear the reflink flag if we truncated everything. + * Clear the reflink flag if there are no data fork blocks and + * there are no extents staged in the cow fork. */ - if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { + if (ip->i_d.di_nblocks == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); } -- cgit From 3af423b03435c81036fa710623d3ae92fbe346a3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: xfs: evict CoW fork extents when performing finsert/fcollapse When we perform an finsert/fcollapse operation, cancel all the CoW extents for the affected file offset range so that they don't end up pointing to the wrong blocks. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index cd9a5400ba4f..bc6c6e10a969 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1459,7 +1459,19 @@ xfs_shift_file_space( return error; /* - * The extent shiting code works on extent granularity. So, if + * Clean out anything hanging around in the cow fork now that + * we've flushed all the dirty data out to disk to avoid having + * CoW extents at the wrong offsets. + */ + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF, + true); + if (error) + return error; + } + + /* + * The extent shifting code works on extent granularity. So, if * stop_fsb is not the starting block of extent, we need to split * the extent at stop_fsb. */ -- cgit From e150dcd459e1b441eaf08f341a986f04e61bf3b8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Sep 2017 11:34:16 -0700 Subject: fs/xfs: Use %pS printk format for direct addresses Use the %pS instead of the %pF printk format specifier for printing symbols from direct addresses. This is needed for the ia64, ppc64 and parisc64 architectures. Signed-off-by: Helge Deller Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index bd786a9ac2c3..eaf86f55b7f2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -347,7 +347,7 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", __return_address, bp->b_ops->name, bp->b_bn); -- cgit From 64671bafbdd984535aa382bccadd91fbe7be0e80 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:38:58 -0700 Subject: xfs: kill meaningless variable 'zero' In xfs_file_aio_write_checks(), variable 'zero' is there only to satisfy xfs_zero_eof(), the result of it is ignored. Now, with iomap_zero_range() based xfs_zero_eof(), we can safely pass NULL as the last param of it and kill 'zero'. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebdd0bd2b261..261d83f1db76 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -377,8 +377,6 @@ restart: */ spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { - bool zero = false; - spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { @@ -399,7 +397,7 @@ restart: drained_dio = true; goto restart; } - error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); + error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); if (error) return error; } else -- cgit From d20a5e3851969fa685f118a80e4df670255a4e8d Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:39:23 -0700 Subject: xfs: report zeroed or not correctly in xfs_zero_range() The 'did_zero' param of xfs_zero_range() was not passed to iomap_zero_range() correctly. This was introduced by commit 7bb41db3ea16 ("xfs: handle 64-bit length in xfs_iozero"), and found by code inspection. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 261d83f1db76..350b6d43ba23 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -58,7 +58,7 @@ xfs_zero_range( xfs_off_t count, bool *did_zero) { - return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); + return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); } int -- cgit From 1e6fa688bffc0ff419a4c3e78dbaf7aabfb55183 Mon Sep 17 00:00:00 2001 From: Kenjiro Nakayama Date: Mon, 18 Sep 2017 12:03:56 -0700 Subject: xfs: Output warning message when discard option was enabled even though the device does not support discard In order to using discard function, it is necessary that not only xfs is mounted with discard option, but also the discard function is supported by the device. Current code doesn't output any message when users mount with discard option on unsupported device, so it is difficult to notice that it was not enabled actually. This patch adds the warning message to notice that discard option is not enabled due to unsupported device when the filesystem is mounted. Changes in v2 (Suggested by Brian Foster): - Move the unsupported device check into xfs_fs_fill_super(). - Clear the discard flag when device is unsupported. Signed-off-by: Kenjiro Nakayama Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c996f4ae4a5f..584cf2d573ba 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1654,6 +1654,16 @@ xfs_fs_fill_super( "DAX and reflink have not been tested together!"); } + if (mp->m_flags & XFS_MOUNT_DISCARD) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + + if (!blk_queue_discard(q)) { + xfs_warn(mp, "mounting with \"discard\" option, but " + "the device does not support discard"); + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } + } + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, -- cgit From 60915f83cd1e021a66fc1503a446aef5c772553a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 13:38:46 -0700 Subject: xfs: remove redundant re-initialization of total_nr_pages Variable total_nr_pages is being initialized and then updated with the same value, this latter assignment is redundant and can be removed. Cleans up clang build warning: Value stored to 'total_nr_pages' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index da14658da310..2f97c12ca75e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1258,8 +1258,6 @@ xfs_buf_ioapply_map( int size; int offset; - total_nr_pages = bp->b_page_count; - /* skip the pages in the buffer before the start offset */ page_index = 0; offset = *buf_offset; -- cgit From f091fb8c344ce13cbf058d304c6cbb042be97058 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Sep 2017 13:47:23 +0200 Subject: scsi: scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add() During failover there is a small race window between fc_remote_port_add() and fc_timeout_deleted_rport(); the latter drops the lock after setting the port to NOTPRESENT, so if fc_remote_port_add() is called right at that time it will fail to detect the existing rport and happily adding a new structure, causing rports to get registered twice. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_fc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index e74fffc32c75..cbd4495d0ff9 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2739,7 +2739,8 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, list_for_each_entry(rport, &fc_host->rports, peers) { - if ((rport->port_state == FC_PORTSTATE_BLOCKED) && + if ((rport->port_state == FC_PORTSTATE_BLOCKED || + rport->port_state == FC_PORTSTATE_NOTPRESENT) && (rport->channel == channel)) { switch (fc_host->tgtid_bind_type) { -- cgit From 6098d7ddd62f532f80ee2a4b01aca500a8e4e9e4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Sep 2017 23:29:18 +0200 Subject: rocker: fix rocker_tlv_put_* functions for KASAN Inlining these functions creates lots of stack variables that each take 64 bytes when KASAN is enabled, leading to this warning about potential stack overflow: drivers/net/ethernet/rocker/rocker_ofdpa.c: In function 'ofdpa_cmd_flow_tbl_add': drivers/net/ethernet/rocker/rocker_ofdpa.c:621:1: error: the frame size of 2752 bytes is larger than 1536 bytes [-Werror=frame-larger-than=] gcc-8 can now consolidate the stack slots itself, but on older versions we get the same behavior by using a temporary variable that holds a copy of the inline function argument. Cc: stable@vger.kernel.org Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker_tlv.h | 48 ++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_tlv.h b/drivers/net/ethernet/rocker/rocker_tlv.h index a63ef82e7c72..dfae3c9d57c6 100644 --- a/drivers/net/ethernet/rocker/rocker_tlv.h +++ b/drivers/net/ethernet/rocker/rocker_tlv.h @@ -139,40 +139,52 @@ rocker_tlv_start(struct rocker_desc_info *desc_info) int rocker_tlv_put(struct rocker_desc_info *desc_info, int attrtype, int attrlen, const void *data); -static inline int rocker_tlv_put_u8(struct rocker_desc_info *desc_info, - int attrtype, u8 value) +static inline int +rocker_tlv_put_u8(struct rocker_desc_info *desc_info, int attrtype, u8 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u8), &value); + u8 tmp = value; /* work around GCC PR81715 */ + + return rocker_tlv_put(desc_info, attrtype, sizeof(u8), &tmp); } -static inline int rocker_tlv_put_u16(struct rocker_desc_info *desc_info, - int attrtype, u16 value) +static inline int +rocker_tlv_put_u16(struct rocker_desc_info *desc_info, int attrtype, u16 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u16), &value); + u16 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u16), &tmp); } -static inline int rocker_tlv_put_be16(struct rocker_desc_info *desc_info, - int attrtype, __be16 value) +static inline int +rocker_tlv_put_be16(struct rocker_desc_info *desc_info, int attrtype, __be16 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(__be16), &value); + __be16 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(__be16), &tmp); } -static inline int rocker_tlv_put_u32(struct rocker_desc_info *desc_info, - int attrtype, u32 value) +static inline int +rocker_tlv_put_u32(struct rocker_desc_info *desc_info, int attrtype, u32 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u32), &value); + u32 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u32), &tmp); } -static inline int rocker_tlv_put_be32(struct rocker_desc_info *desc_info, - int attrtype, __be32 value) +static inline int +rocker_tlv_put_be32(struct rocker_desc_info *desc_info, int attrtype, __be32 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(__be32), &value); + __be32 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(__be32), &tmp); } -static inline int rocker_tlv_put_u64(struct rocker_desc_info *desc_info, - int attrtype, u64 value) +static inline int +rocker_tlv_put_u64(struct rocker_desc_info *desc_info, int attrtype, u64 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u64), &value); + u64 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u64), &tmp); } static inline struct rocker_tlv * -- cgit From b4391db42308c9940944b5d7be5ca4b78fb88dd0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Sep 2017 23:29:19 +0200 Subject: netlink: fix nla_put_{u8,u16,u32} for KASAN When CONFIG_KASAN is enabled, the "--param asan-stack=1" causes rather large stack frames in some functions. This goes unnoticed normally because CONFIG_FRAME_WARN is disabled with CONFIG_KASAN by default as of commit 3f181b4d8652 ("lib/Kconfig.debug: disable -Wframe-larger-than warnings with KASAN=y"). The kernelci.org build bot however has the warning enabled and that led me to investigate it a little further, as every build produces these warnings: net/wireless/nl80211.c:4389:1: warning: the frame size of 2240 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/wireless/nl80211.c:1895:1: warning: the frame size of 3776 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/wireless/nl80211.c:1410:1: warning: the frame size of 2208 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/bridge/br_netlink.c:1282:1: warning: the frame size of 2544 bytes is larger than 2048 bytes [-Wframe-larger-than=] Most of this problem is now solved in gcc-8, which can consolidate the stack slots for the inline function arguments. On older compilers we can add a workaround by declaring a local variable in each function to pass the inline function argument. Cc: stable@vger.kernel.org Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/net/netlink.h | 73 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index e51cf5f81597..14c289393071 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -773,7 +773,10 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { - return nla_put(skb, attrtype, sizeof(u8), &value); + /* temporary variables to work around GCC PR81715 with asan-stack=1 */ + u8 tmp = value; + + return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** @@ -784,7 +787,9 @@ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { - return nla_put(skb, attrtype, sizeof(u16), &value); + u16 tmp = value; + + return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** @@ -795,7 +800,9 @@ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { - return nla_put(skb, attrtype, sizeof(__be16), &value); + __be16 tmp = value; + + return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** @@ -806,7 +813,9 @@ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { - return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, value); + __be16 tmp = value; + + return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** @@ -817,7 +826,9 @@ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { - return nla_put(skb, attrtype, sizeof(__le16), &value); + __le16 tmp = value; + + return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** @@ -828,7 +839,9 @@ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { - return nla_put(skb, attrtype, sizeof(u32), &value); + u32 tmp = value; + + return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** @@ -839,7 +852,9 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { - return nla_put(skb, attrtype, sizeof(__be32), &value); + __be32 tmp = value; + + return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** @@ -850,7 +865,9 @@ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { - return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, value); + __be32 tmp = value; + + return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** @@ -861,7 +878,9 @@ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { - return nla_put(skb, attrtype, sizeof(__le32), &value); + __le32 tmp = value; + + return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** @@ -874,7 +893,9 @@ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(u64), &value, padattr); + u64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** @@ -887,7 +908,9 @@ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr); + __be64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** @@ -900,7 +923,9 @@ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { - return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, value, + __be64 tmp = value; + + return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } @@ -914,7 +939,9 @@ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(__le64), &value, padattr); + __le64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** @@ -925,7 +952,9 @@ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { - return nla_put(skb, attrtype, sizeof(s8), &value); + s8 tmp = value; + + return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** @@ -936,7 +965,9 @@ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { - return nla_put(skb, attrtype, sizeof(s16), &value); + s16 tmp = value; + + return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** @@ -947,7 +978,9 @@ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { - return nla_put(skb, attrtype, sizeof(s32), &value); + s32 tmp = value; + + return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** @@ -960,7 +993,9 @@ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(s64), &value, padattr); + s64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** @@ -1010,7 +1045,9 @@ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { - return nla_put_be32(skb, attrtype, addr); + __be32 tmp = addr; + + return nla_put_be32(skb, attrtype, tmp); } /** -- cgit From 4618e90965f272fe522f2af2523a60d0d4bc78f3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:10 +0200 Subject: x86/fpu: Fix fpu__activate_fpstate_read() and update comments fpu__activate_fpstate_read() can be called for the current task when coredumping - or for stopped tasks when ptrace-ing. Implement this properly in the code and update the comments. This also fixes an incorrect (but harmless) warning introduced by one of the earlier patches. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-28-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 93103a909c47..afd3f2a5c64e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -254,18 +254,21 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); /* * This function must be called before we read a task's fpstate. * - * If the task has not used the FPU before then initialize its - * fpstate. + * There's two cases where this gets called: + * + * - for the current task (when coredumping), in which case we have + * to save the latest FPU registers into the fpstate, + * + * - or it's called for stopped tasks (ptrace), in which case the + * registers were already saved by the context-switch code when + * the task scheduled out - we only have to initialize the registers + * if they've never been initialized. * * If the task has used the FPU before then save it. */ void fpu__activate_fpstate_read(struct fpu *fpu) { - /* - * If fpregs are active (in the current CPU), then - * copy them to the fpstate: - */ - if (fpu->fpstate_active) { + if (fpu == ¤t->thread.fpu) { fpu__save(fpu); } else { if (!fpu->fpstate_active) { -- cgit From 685c930d6e58e31e251ec354f9dca3958a4c5040 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:11 +0200 Subject: x86/fpu: Remove fpu__current_fpstate_write_begin/end() These functions are not used anymore, so remove them. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-29-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 -- arch/x86/kernel/fpu/core.c | 63 ------------------------------------- 2 files changed, 65 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index cf290d424e48..508e4181c4af 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,8 +26,6 @@ extern void fpu__activate_curr(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate_write(struct fpu *fpu); -extern void fpu__current_fpstate_write_begin(void); -extern void fpu__current_fpstate_write_end(void); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index afd3f2a5c64e..b2cdeb3b1860 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -316,69 +316,6 @@ void fpu__activate_fpstate_write(struct fpu *fpu) } } -/* - * This function must be called before we write the current - * task's fpstate. - * - * This call gets the current FPU register state and moves - * it in to the 'fpstate'. Preemption is disabled so that - * no writes to the 'fpstate' can occur from context - * swiches. - * - * Must be followed by a fpu__current_fpstate_write_end(). - */ -void fpu__current_fpstate_write_begin(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * Ensure that the context-switching code does not write - * over the fpstate while we are doing our update. - */ - preempt_disable(); - - /* - * Move the fpregs in to the fpu's 'fpstate'. - */ - fpu__activate_fpstate_read(fpu); - - /* - * The caller is about to write to 'fpu'. Ensure that no - * CPU thinks that its fpregs match the fpstate. This - * ensures we will not be lazy and skip a XRSTOR in the - * future. - */ - __fpu_invalidate_fpregs_state(fpu); -} - -/* - * This function must be paired with fpu__current_fpstate_write_begin() - * - * This will ensure that the modified fpstate gets placed back in - * the fpregs if necessary. - * - * Note: This function may be called whether or not an _actual_ - * write to the fpstate occurred. - */ -void fpu__current_fpstate_write_end(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * 'fpu' now has an updated copy of the state, but the - * registers may still be out of date. Update them with - * an XRSTOR if they are active. - */ - if (fpu->fpstate_active) - copy_kernel_to_fpregs(&fpu->state); - - /* - * Our update is done and the fpregs/fpstate are in sync - * if necessary. Context switches can happen again. - */ - preempt_enable(); -} - /* * 'fpu__restore()' is called to copy FPU registers from * the FPU fpstate to the live hw registers and to activate -- cgit From e4a81bfcaae1ebbdc6efe74e8ea563144d90e9a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Sep 2017 09:43:36 +0200 Subject: x86/fpu: Rename fpu::fpstate_active to fpu::initialized The x86 FPU code used to have a complex state machine where both the FPU registers and the FPU state context could be 'active' (or inactive) independently of each other - which enabled features like lazy FPU restore. Much of this complexity is gone in the current code: now we basically can have FPU-less tasks (kernel threads) that don't use (and save/restore) FPU state at all, plus full FPU users that save/restore directly with no laziness whatsoever. But the fpu::fpstate_active still carries bits of the old complexity - meanwhile this flag has become a simple flag that shows whether the FPU context saving area in the thread struct is initialized and used, or not. Rename it to fpu::initialized to express this simplicity in the name as well. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-30-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/include/asm/fpu/types.h | 6 +++--- arch/x86/include/asm/trace/fpu.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 24 ++++++++++++------------ arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/regset.c | 6 +++--- arch/x86/kernel/fpu/signal.c | 8 ++++---- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kernel/signal.c | 6 +++--- arch/x86/mm/pkeys.c | 2 +- 11 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e0bb46c02857..0e2a5edbce00 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -231,7 +231,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (fpu->fpstate_active) { + if (fpu->initialized) { unsigned long fx_aligned, math_size; sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 508e4181c4af..b26ae05da18a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -527,7 +527,7 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->fpstate_active) { + if (old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else @@ -550,7 +550,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { bool preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active; + new_fpu->initialized; if (preload) { if (!fpregs_state_valid(new_fpu, cpu)) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 71db45ca8870..a1520575d86b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -293,13 +293,13 @@ struct fpu { unsigned int last_cpu; /* - * @fpstate_active: + * @initialized: * - * This flag indicates whether this context is active: if the task + * This flag indicates whether this context is initialized: if the task * is not running then we can restore from this context, if the task * is running then we should save into this context. */ - unsigned char fpstate_active; + unsigned char initialized; /* * @state: diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index da565aae9fd2..39f7a27bef13 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -12,22 +12,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpstate_active) + __field(bool, initialized) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->fpstate_active = fpu->fpstate_active; + __entry->initialized = fpu->initialized; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpstate_active, + __entry->initialized, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b2cdeb3b1860..c8d6032f04d0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpstate_active) { + if (fpu->initialized) { /* * Ignore return value -- we don't care if reg state * is clobbered. @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpstate_active) + if (fpu->initialized) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -148,7 +148,7 @@ void fpu__save(struct fpu *fpu) preempt_disable(); trace_x86_fpu_before_save(fpu); - if (fpu->fpstate_active) { + if (fpu->initialized) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -191,7 +191,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) + if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -240,13 +240,13 @@ void fpu__activate_curr(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } EXPORT_SYMBOL_GPL(fpu__activate_curr); @@ -271,13 +271,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu) if (fpu == ¤t->thread.fpu) { fpu__save(fpu); } else { - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } } @@ -303,7 +303,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) */ WARN_ON_FPU(fpu == ¤t->thread.fpu); - if (fpu->fpstate_active) { + if (fpu->initialized) { /* Invalidate any lazy state: */ __fpu_invalidate_fpregs_state(fpu); } else { @@ -312,7 +312,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } @@ -354,7 +354,7 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - if (fpu->fpstate_active) { + if (fpu->initialized) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -363,7 +363,7 @@ void fpu__drop(struct fpu *fpu) } } - fpu->fpstate_active = 0; + fpu->initialized = 0; trace_x86_fpu_dropped(fpu); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d5d44c452624..7affb7e3d9a5 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -240,7 +240,7 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - WARN_ON_FPU(current->thread.fpu.fpstate_active); + WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c764f7405322..19e82334e811 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -16,14 +16,14 @@ int regset_fpregs_active(struct task_struct *target, const struct user_regset *r { struct fpu *target_fpu = &target->thread.fpu; - return target_fpu->fpstate_active ? regset->n : 0; + return target_fpu->initialized ? regset->n : 0; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { struct fpu *target_fpu = &target->thread.fpu; - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) return regset->n; else return 0; @@ -380,7 +380,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) struct fpu *fpu = &tsk->thread.fpu; int fpvalid; - fpvalid = fpu->fpstate_active; + fpvalid = fpu->initialized; if (fpvalid) fpvalid = !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index da68ea1c3a44..ab2dd24cfea4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->fpstate_active || using_compacted_format()) { + if (fpu->initialized || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; @@ -315,12 +315,12 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int err = 0; /* - * Drop the current fpu which clears fpu->fpstate_active. This ensures + * Drop the current fpu which clears fpu->initialized. This ensures * that any context-switch during the copy of the new state, * avoids the intermediate state from getting restored/saved. * Thus avoiding the new restored state from getting corrupted. * We will be ready to restore/save the state only after - * fpu->fpstate_active is again set. + * fpu->initialized is again set. */ fpu__drop(fpu); @@ -342,7 +342,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } - fpu->fpstate_active = 1; + fpu->initialized = 1; preempt_disable(); fpu__restore(fpu); preempt_enable(); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fda1109cc355..703e76d027ee 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -867,7 +867,7 @@ const void *get_xsave_field_ptr(int xsave_state) { struct fpu *fpu = ¤t->thread.fpu; - if (!fpu->fpstate_active) + if (!fpu->initialized) return NULL; /* * fpu__save() takes the CPU's xstate registers diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e04442345fc0..4e188fda5961 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -263,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->fpstate_active) { + if (fpu->initialized) { sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; @@ -279,7 +279,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->fpstate_active && + if (fpu->initialized && copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; @@ -755,7 +755,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (fpu->fpstate_active) + if (fpu->initialized) fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 4d24269c071f..d7bc0eea20a5 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -44,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.fpstate_active && + current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; -- cgit From 7f1487c59b7c6dcb20155f4302985da2659a2997 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:13 +0200 Subject: x86/fpu: Fix stale comments about lazy FPU logic We don't do any lazy restore anymore, what we have are two pieces of optimization: - no-FPU tasks that don't save/restore the FPU context (kernel threads are such) - cached FPU registers maintained via the fpu->last_cpu field. This means that if an FPU task context switches to a non-FPU task then we can maintain the FPU registers as an in-FPU copies (cache), and skip the restoration of them once we switch back to the original FPU-using task. Update all the comments that still referred to old 'lazy' and 'unlazy' concepts. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-31-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c8d6032f04d0..77668d91fdc1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -205,9 +205,6 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * In lazy mode, if the FPU context isn't loaded into - * fpregs, CR0.TS will be set and do_device_not_available - * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -285,13 +282,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu) /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then unlazy it. + * If the task has used the FPU before then invalidate any cached FPU registers. * If the task has not used the FPU before then initialize its fpstate. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we - * didn't clear its lazy status here then the lazy in-registers + * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the modifications. */ @@ -304,7 +301,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) WARN_ON_FPU(fpu == ¤t->thread.fpu); if (fpu->initialized) { - /* Invalidate any lazy state: */ + /* Invalidate any cached state: */ __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); -- cgit From e10078eba69859359ce8644dd423b4132a6a8913 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:14 +0200 Subject: x86/fpu: Simplify and speed up fpu__copy() fpu__copy() has a preempt_disable()/enable() pair, which it had to do to be able to atomically unlazy the current task when doing an FNSAVE. But we don't unlazy tasks anymore, we always do direct saves/restores of FPU context. So remove both the unnecessary critical section, and update the comments. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-32-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 77668d91fdc1..52122dd418ae 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -206,22 +206,13 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. * - * We have to do all this with preemption disabled, - * mostly because of the FNSAVE case, because in that - * case we must not allow preemption in the window - * between the FNSAVE and us marking the context lazy. - * - * It shouldn't be an issue as even FNSAVE is plenty - * fast in terms of critical section length. + * ( The function 'fails' in the FNSAVE case, which destroys + * register contents so we have to copy them back. ) */ - preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, - fpu_kernel_xstate_size); - + memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); copy_kernel_to_fpregs(&src_fpu->state); } - preempt_enable(); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); -- cgit From 2ce03d850b9a2f17d55596ecfa86e72b5687a627 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:15 +0200 Subject: x86/fpu: Rename fpu__activate_curr() to fpu__initialize() Rename this function to better express that it's all about initializing the FPU state of a task which goes hand in hand with the fpu::initialized field. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-33-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b26ae05da18a..7c980aafb8aa 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -23,7 +23,7 @@ /* * High level FPU state handling functions: */ -extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__initialize(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 52122dd418ae..07db9d94b68b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -224,7 +224,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__activate_curr(struct fpu *fpu) +void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); @@ -237,7 +237,7 @@ void fpu__activate_curr(struct fpu *fpu) fpu->initialized = 1; } } -EXPORT_SYMBOL_GPL(fpu__activate_curr); +EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. @@ -316,7 +316,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) */ void fpu__restore(struct fpu *fpu) { - fpu__activate_curr(fpu); + fpu__initialize(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -392,7 +392,7 @@ void fpu__clear(struct fpu *fpu) */ if (static_cpu_has(X86_FEATURE_FPU)) { preempt_disable(); - fpu__activate_curr(fpu); + fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); preempt_enable(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ab2dd24cfea4..7fa3bdb331e9 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -280,7 +280,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd17b7d9a107..03869eb7fcd6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7225,7 +7225,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index d4a7df2205b8..220638a4cb94 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -114,7 +114,7 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; struct fpu *fpu = ¤t->thread.fpu; - fpu__activate_curr(fpu); + fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { -- cgit From 369a036de206710ff27a66f9bffe78ef657648c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 13:37:45 +0200 Subject: x86/fpu: Rename fpu__activate_fpstate_read/write() to fpu__prepare_[read|write]() As per the new nomenclature we don't 'activate' the FPU state anymore, we initialize it. So drop the _activate_fpstate name from these functions, which were a bit of a mouthful anyway, and name them: fpu__prepare_read() fpu__prepare_write() Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/fpu/regset.c | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7c980aafb8aa..e3221ffa304e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -24,8 +24,8 @@ * High level FPU state handling functions: */ extern void fpu__initialize(struct fpu *fpu); -extern void fpu__activate_fpstate_read(struct fpu *fpu); -extern void fpu__activate_fpstate_write(struct fpu *fpu); +extern void fpu__prepare_read(struct fpu *fpu); +extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 07db9d94b68b..f92a6593de1e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(fpu__initialize); * * If the task has used the FPU before then save it. */ -void fpu__activate_fpstate_read(struct fpu *fpu) +void fpu__prepare_read(struct fpu *fpu) { if (fpu == ¤t->thread.fpu) { fpu__save(fpu); @@ -283,7 +283,7 @@ void fpu__activate_fpstate_read(struct fpu *fpu) * state pending on its former CPU could be restored, corrupting * the modifications. */ -void fpu__activate_fpstate_write(struct fpu *fpu) +void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 19e82334e811..ee8d2f049818 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -38,7 +38,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -55,7 +55,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -89,7 +89,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (using_compacted_format()) { if (kbuf) @@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) @@ -310,7 +310,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -340,7 +340,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) -- cgit From e63e5d5c15c6b1dba26f7cbd1b1089a1d6155db5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:04 +0200 Subject: x86/fpu: Introduce validate_xstate_header() Move validation of user-supplied xstate_header into a helper function, in preparation of calling it from both the ptrace and sigreturn syscall paths. The new function also considers it to be an error if *any* reserved bits are set, whereas before we were just clearing most of them silently. This should reduce the chance of bugs that fail to correctly validate user-supplied XSAVE areas. It also will expose any broken userspace programs that set the other reserved bits; this is desirable because such programs will lose compatibility with future CPUs and kernels if those bits are ever used for anything. (There shouldn't be any such programs, and in fact in the case where the compacted format is in use we were already validating xfeatures. But you never know...) Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-2-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++++ arch/x86/kernel/fpu/xstate.c | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 579ac2358e63..83fee2469eb7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -52,4 +52,8 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); + +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +extern int validate_xstate_header(const struct xstate_header *hdr); + #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 703e76d027ee..2427aeea33b5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -483,6 +483,30 @@ int using_compacted_format(void) return boot_cpu_has(X86_FEATURE_XSAVES); } +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +int validate_xstate_header(const struct xstate_header *hdr) +{ + /* No unknown or supervisor features may be set */ + if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + return -EINVAL; + + /* Userspace must use the uncompacted format */ + if (hdr->xcomp_bv) + return -EINVAL; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! + */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + return 0; +} + static void __xstate_dump_leaves(void) { int i; -- cgit From cf9df81b139b6ebaec188d73758f02ca3b2110e4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:05 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in xstateregs_set() Tighten the checks in xstateregs_set(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-3-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ee8d2f049818..b831d5b9de99 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -141,27 +141,20 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - - /* xcomp_bv must be 0 when using uncompacted format */ - if (!ret && xsave->header.xcomp_bv) - ret = -EINVAL; + if (!ret) + ret = validate_xstate_header(&xsave->header); } - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); - /* * mxcsr reserved bits must be masked to zero for security reasons. */ xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->header.xfeatures &= xfeatures_mask; + /* - * These bits must be zero. + * In case of failure, mark all states as init: */ - memset(&xsave->header.reserved, 0, 48); + if (ret) + fpstate_init(&fpu->state); return ret; } -- cgit From b11e2e18a7fc8eaa3d592c260d50c7129e094ded Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:06 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in __fpu__restore_sig() Tighten the checks in __fpu__restore_sig() and update comments. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-4-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/signal.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7fa3bdb331e9..fb639e70048f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -214,8 +214,11 @@ sanitize_restored_xstate(struct task_struct *tsk, struct xstate_header *header = &xsave->header; if (use_xsave()) { - /* These bits must be zero. */ - memset(header->reserved, 0, 48); + /* + * Note: we don't need to zero the reserved bits in the + * xstate_header here because we either didn't copy them at all, + * or we checked earlier that they aren't set. + */ /* * Init the state that is not present in the memory @@ -224,7 +227,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= (xfeatures_mask & xfeatures); + header->xfeatures &= xfeatures; } if (use_fxsr()) { @@ -308,7 +311,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) /* * For 32-bit frames with fxstate, copy the user state to the * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. + * header. Validate and sanitize the copied state. */ struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; @@ -329,9 +332,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - /* xcomp_bv must be 0 when using uncompacted format */ - if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) - err = -EINVAL; + if (!err && state_size > offsetof(struct xregs_state, header)) + err = validate_xstate_header(&fpu->state.xsave.header); } if (err || __copy_from_user(&env, buf, sizeof(env))) { -- cgit From 80d8ae86b36791a545ca28ddc95133ea59bba6e0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:07 +0200 Subject: x86/fpu: Copy the full state_header in copy_kernel_to_xstate() This is in preparation to verify the full xstate header as supplied by user-space. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-5-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2427aeea33b5..02591b96bb25 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1148,11 +1148,13 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) int i; u64 xfeatures; u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - memcpy(&xfeatures, kbuf + offset, size); + memcpy(&hdr, kbuf + offset, size); + xfeatures = hdr.xfeatures; /* * Reject if the user sets any disabled or supervisor features: -- cgit From b89eda482d7849a1c146b6d0a42f4e76369bb08e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:08 +0200 Subject: x86/fpu: Eliminate the 'xfeatures' local variable in copy_kernel_to_xstate() We have this information in the xstate_header. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-6-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 02591b96bb25..c97c4a9db52a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1146,7 +1146,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 xfeatures; u64 allowed_features; struct xstate_header hdr; @@ -1154,20 +1153,19 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) size = sizeof(hdr); memcpy(&hdr, kbuf + offset, size); - xfeatures = hdr.xfeatures; /* * Reject if the user sets any disabled or supervisor features: */ allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - if (xfeatures & ~allowed_features) + if (hdr.xfeatures & ~allowed_features) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; @@ -1177,7 +1175,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } - if (xfeatures_mxcsr_quirk(xfeatures)) { + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; memcpy(&xsave->i387.mxcsr, kbuf + offset, size); @@ -1192,7 +1190,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } -- cgit From af95774b3ca080b0e1e651c0fc7680f3444ddda7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:09 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_kernel_to_xstate() Tighten the checks in copy_kernel_to_xstate(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-7-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c97c4a9db52a..325db7850335 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1138,15 +1138,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * and copy to the target thread. This is called from xstateregs_set(). */ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 allowed_features; struct xstate_header hdr; offset = offsetof(struct xregs_state, header); @@ -1154,12 +1151,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - /* - * Reject if the user sets any disabled or supervisor features: - */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - - if (hdr.xfeatures & ~allowed_features) + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { -- cgit From af2c4322d986a08a6e793b74b83a62b325019c20 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:10 +0200 Subject: x86/fpu: Copy the full header in copy_user_to_xstate() This is in preparation to verify the full xstate header as supplied by user-space. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-8-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 325db7850335..0cd7b73c25e8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1199,13 +1199,16 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) int i; u64 xfeatures; u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - if (__copy_from_user(&xfeatures, ubuf + offset, size)) + if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; + xfeatures = hdr.xfeatures; + /* * Reject if the user sets any disabled or supervisor features: */ -- cgit From 3d703477bcfe8bb57079d97198cf1e342fe1fef9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:11 +0200 Subject: x86/fpu: Eliminate the 'xfeatures' local variable in copy_user_to_xstate() We now have this field in hdr.xfeatures. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-9-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0cd7b73c25e8..b6d78b78b5c2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1197,7 +1197,6 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; - u64 xfeatures; u64 allowed_features; struct xstate_header hdr; @@ -1207,20 +1206,18 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - xfeatures = hdr.xfeatures; - /* * Reject if the user sets any disabled or supervisor features: */ allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - if (xfeatures & ~allowed_features) + if (hdr.xfeatures & ~allowed_features) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; @@ -1231,7 +1228,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } - if (xfeatures_mxcsr_quirk(xfeatures)) { + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) @@ -1247,7 +1244,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } -- cgit From 98c0fad9d60e8b2cd47e15b7bee7df343648f5bb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:12 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_user_to_xstate() Tighten the checks in copy_user_to_xstate(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-10-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b6d78b78b5c2..f1d5476c9022 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1188,16 +1188,15 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } /* - * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * Convert from a ptrace or sigreturn standard-format user-space buffer to + * kernel XSAVES format and copy to the target thread. This is called from + * xstateregs_set(), as well as potentially from the sigreturn() and + * rt_sigreturn() system calls. */ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; - u64 allowed_features; struct xstate_header hdr; offset = offsetof(struct xregs_state, header); @@ -1206,12 +1205,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - /* - * Reject if the user sets any disabled or supervisor features: - */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - - if (hdr.xfeatures & ~allowed_features) + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { -- cgit From 738f48cb5fdd5878d11934f1898aa2bcf1578289 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:13 +0200 Subject: x86/fpu: Use using_compacted_format() instead of open coded X86_FEATURE_XSAVES This is the canonical method to use. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-11-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b831d5b9de99..3ea151372389 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -134,7 +134,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__prepare_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (using_compacted_format()) { if (kbuf) ret = copy_kernel_to_xstate(xsave, kbuf); else -- cgit From a98c75fcd0ec02623f4f56d824d76e659410a52b Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Aug 2017 19:13:26 +0200 Subject: drm/tegra: trace: Fix path to include The TRACE_INCLUDE_FILE macro needs to specify the path relative to the define_trace.h header rather than relative to the file defining it. Reported-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20170823171326.23620-1-thierry.reding@gmail.com --- drivers/gpu/drm/tegra/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tegra/trace.h b/drivers/gpu/drm/tegra/trace.h index e9b7cdad5c4c..5a1ab4046e92 100644 --- a/drivers/gpu/drm/tegra/trace.h +++ b/drivers/gpu/drm/tegra/trace.h @@ -63,6 +63,6 @@ DEFINE_EVENT(register_access, sor_readl, /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/tegra #define TRACE_INCLUDE_FILE trace #include -- cgit From d8bd9f3f0925d22726de159531bfe3774b5cacc6 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 22 Sep 2017 13:32:21 +1000 Subject: powerpc: Handle MCE on POWER9 with only DSISR bit 30 set On POWER9 DD2.1 and below, it's possible for a paste instruction to cause a Machine Check Exception (MCE) where only DSISR bit 30 (IBM 33) is set. This will result in the MCE handler seeing an unknown event, which triggers linux to crash. We change this by detecting unknown events caused by load/stores in the MCE handler and marking them as handled so that we no longer crash. An MCE that occurs like this is spurious, so we don't need to do anything in terms of servicing it. If there is something that needs to be serviced, the CPU will raise the MCE again with the correct DSISR so that it can be serviced properly. Signed-off-by: Michael Neuling Reviewed-by: Nicholas Piggin [mpe: Expand comment with details from change log, use normal bit #s] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce_power.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index b76ca198e09c..f523125b9d34 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -624,5 +624,18 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs) long __machine_check_early_realmode_p9(struct pt_regs *regs) { + /* + * On POWER9 DD2.1 and below, it's possible to get a machine check + * caused by a paste instruction where only DSISR bit 30 is set. This + * will result in the MCE handler seeing an unknown event and the kernel + * crashing. An MCE that occurs like this is spurious, so we don't need + * to do anything in terms of servicing it. If there is something that + * needs to be serviced, the CPU will raise the MCE again with the + * correct DSISR so that it can be serviced properly. So detect this + * case and mark it as handled. + */ + if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x40000000) + return 1; + return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); } -- cgit From ff40adf7fbdff96860b1153332c0b1c7bab6e0c1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 24 Aug 2017 18:19:48 -0600 Subject: Btrfs: use the new helper wbc_to_write_flags This updates btrfs to use the helper wbc_to_write_flags which has been applied in ext4/xfs/f2fs/block. Please note that, with this, btrfs's dirty pages written by a writeback job will carry the flag REQ_BACKGROUND, which is currently used by writeback-throttle to determine whether it should go to get a request or wait. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d17783d70228..4ead6da5a645 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3471,8 +3471,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned int write_flags = 0; unsigned long nr_written = 0; - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = REQ_SYNC; + write_flags = wbc_to_write_flags(wbc); trace___extent_writepage(page, inode, wbc); @@ -3718,7 +3717,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - unsigned int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); -- cgit From 5f14efd3d437205143dcffcf776e0122eae1755a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 23 Aug 2017 12:15:09 -0600 Subject: Btrfs: do not reset bio->bi_ops while writing bio flush_epd_write_bio() sets bio->bi_opf by itself to honor REQ_SYNC, but it's not needed at all since bio->bi_opf has set up properly in both __extent_writepage() and write_one_eb(), and in the case of write_one_eb(), it also sets REQ_META, which we will lose in flush_epd_write_bio(). This remove this unnecessary bio->bi_opf setting. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4ead6da5a645..3738d245518c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4062,9 +4062,6 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->bio) { int ret; - bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? REQ_SYNC : 0); - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; -- cgit From bea7eafdbda3ba1d4b2ccb9cca829eefb7989bb9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 22 Aug 2017 23:46:00 -0700 Subject: Btrfs: fix incorrect {node,sector}size endianness from BTRFS_IOC_FS_INFO fs_info->super_copy->{node,sector}size are little-endian, but the ioctl should return the values in native endianness. Use the cached values in btrfs_fs_info instead. Found with sparse. Fixes: 80a773fbfc2d ("btrfs: retrieve more info from FS_INFO ioctl") Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ae8fbf9d3de2..cf1c2ee030dd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2769,9 +2769,9 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } mutex_unlock(&fs_devices->device_list_mutex); - fi_args->nodesize = fs_info->super_copy->nodesize; - fi_args->sectorsize = fs_info->super_copy->sectorsize; - fi_args->clone_alignment = fs_info->super_copy->sectorsize; + fi_args->nodesize = fs_info->nodesize; + fi_args->sectorsize = fs_info->sectorsize; + fi_args->clone_alignment = fs_info->sectorsize; if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; -- cgit From 63d71450c8d817649a79e37d685523f988b9cc98 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 1 Sep 2017 17:58:47 +0900 Subject: btrfs: clear ordered flag on cleaning up ordered extents Commit 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") introduced btrfs_cleanup_ordered_extents() to cleanup submitted ordered extents. However, it does not clear the ordered bit (Private2) of corresponding pages. Thus, the following BUG occurs from free_pages_check_bad() (on btrfs/125 with nospace_cache). BUG: Bad page state in process btrfs pfn:3fa787 page:ffffdf2acfe9e1c0 count:0 mapcount:0 mapping: (null) index:0xd flags: 0x8000000000002008(uptodate|private_2) raw: 8000000000002008 0000000000000000 000000000000000d 00000000ffffffff raw: ffffdf2acf5c1b20 ffffb443802238b0 0000000000000000 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: 0x2000(private_2) This patch clears the flag same as other places calling btrfs_dec_test_ordered_pending() for every page in the specified range. Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") Cc: # 4.12 Signed-off-by: Naohiro Aota Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d184a46e46c4..455c0f22fe2d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, const u64 offset, const u64 bytes) { + unsigned long index = offset >> PAGE_SHIFT; + unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + index++; + if (!page) + continue; + ClearPagePrivate2(page); + put_page(page); + } return __endio_write_update_ordered(inode, offset + PAGE_SIZE, bytes - PAGE_SIZE, false); } -- cgit From 67c003f90fd68062d92a7ffade36f9b2a9098bd8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 1 Sep 2017 17:59:07 +0900 Subject: btrfs: finish ordered extent cleaning if no progress is found __endio_write_update_ordered() repeats the search until it reaches the end of the specified range. This works well with direct IO path, because before the function is called, it's ensured that there are ordered extents filling whole the range. It's not the case, however, when it's called from run_delalloc_range(): it is possible to have error in the midle of the loop in e.g. run_delalloc_nocow(), so that there exisits the range not covered by any ordered extents. By cleaning such "uncomplete" range, __endio_write_update_ordered() stucks at offset where there're no ordered extents. Since the ordered extents are created from head to tail, we can stop the search if there are no offset progress. Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") Cc: # 4.12 Signed-off-by: Naohiro Aota Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 455c0f22fe2d..f78c5640c6dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8396,6 +8396,7 @@ static void __endio_write_update_ordered(struct inode *inode, btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; + u64 last_offset; int ret; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -8407,6 +8408,7 @@ static void __endio_write_update_ordered(struct inode *inode, } again: + last_offset = ordered_offset; ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, @@ -8417,6 +8419,12 @@ again: btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered->work); out_test: + /* + * If btrfs_dec_test_ordered_pending does not find any ordered extent + * in the range, we can exit. + */ + if (ordered_offset == last_offset) + return; /* * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again -- cgit From bb166d7207432d3c7d10c45dc052f12ba3a2121d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 25 Aug 2017 14:15:14 +0900 Subject: btrfs: fix NULL pointer dereference from free_reloc_roots() __del_reloc_root should be called before freeing up reloc_root->node. If not, calling __del_reloc_root() dereference reloc_root->node, causing the system BUG. Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error") Cc: # 4.9 Signed-off-by: Naohiro Aota Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3a49a3c2fca4..9841faef08ea 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2400,11 +2400,11 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->node = NULL; reloc_root->commit_root = NULL; - __del_reloc_root(reloc_root); } } -- cgit From ca6842bf01dc1ad41195eac1e343b4f08c496ba8 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Fri, 22 Jan 2016 09:13:25 +0900 Subject: Btrfs: send: fix error number for unknown inode types ENOTSUPP should not be returned to the user program. (cf. include/linux/errno.h) Therefore, EOPNOTSUPP is used instead of ENOTSUPP. Signed-off-by: Tsutomu Itoh Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f1d3d6e7087..43430e6c99aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2640,7 +2640,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) } else { btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; goto out; } -- cgit From 6d6d282932d1a609e60dc4467677e0e863682f57 Mon Sep 17 00:00:00 2001 From: satoru takeuchi Date: Tue, 12 Sep 2017 22:42:52 +0900 Subject: btrfs: prevent to set invalid default subvolid `btrfs sub set-default` succeeds to set an ID which isn't corresponding to any fs/file tree. If such the bad ID is set to a filesystem, we can't mount this filesystem without specifying `subvol` or `subvolid` mount options. Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol") Cc: Signed-off-by: Satoru Takeuchi Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cf1c2ee030dd..d4a77993e52f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4057,6 +4057,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } + if (!is_fstree(new_root->objectid)) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); if (!path) { -- cgit From 78ad4ce014d025f41b8dde3a81876832ead643cf Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 8 Sep 2017 17:48:55 +0900 Subject: btrfs: propagate error to btrfs_cmp_data_prepare caller btrfs_cmp_data_prepare() (almost) always returns 0 i.e. ignoring errors from gather_extent_pages(). While the pages are freed by btrfs_cmp_data_free(), cmp->num_pages still has > 0. Then, btrfs_extent_same() try to access the already freed pages causing faults (or violates PageLocked assertion). This patch just return the error as is so that the caller stop the process. Signed-off-by: Naohiro Aota Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage") Cc: # 4.2 Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4a77993e52f..802df5755cd3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3028,7 +3028,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, out: if (ret) btrfs_cmp_data_free(cmp); - return 0; + return ret; } static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) -- cgit From c2faff790ccd11ea5be8e3ca99713f116fcd6030 Mon Sep 17 00:00:00 2001 From: "Misono, Tomohiro" Date: Wed, 30 Aug 2017 16:33:16 +0900 Subject: btrfs: remove BTRFS_FS_QUOTA_DISABLING flag Currently, "btrfs quota enable" would fail after "btrfs quota disable" on the first time with syslog output "qgroup_rescan_init failed with -22", but it would succeed on the second time. When "quota disable" is called, BTRFS_FS_QUOTA_DISABLING flag bit will be set in fs_info->flags in btrfs_quota_disable(), but it will not be droppd in btrfs_run_qgroups() (which is called in btrfs_commit_transaction()) because quota_root has already been freed. If "quota enable" is called after that, both BTRFS_FS_QUOTA_DISABLING and BTRFS_FS_QUOTA_ENABLED flag would be dropped in the btrfs_run_qgroups() since quota_root is not NULL. This leads to the failure of "quota enable" on the first time. BTRFS_FS_QUOTA_DISABLING flag is not used outside of "quota disable" context and is equivalent to whether quota_root is NULL or not. btrfs_run_qgroups() checks whether quota_root is NULL or not in the first place. So, let's remove BTRFS_FS_QUOTA_DISABLING flag. Signed-off-by: Tomohiro Misono Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/qgroup.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2add002662f4..b7ccfcc01732 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -708,7 +708,6 @@ struct btrfs_delayed_root; #define BTRFS_FS_OPEN 5 #define BTRFS_FS_QUOTA_ENABLED 6 #define BTRFS_FS_QUOTA_ENABLING 7 -#define BTRFS_FS_QUOTA_DISABLING 8 #define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 #define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 #define BTRFS_FS_BTREE_ERR 11 diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5c8b61c86e61..770f667269f5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -807,7 +807,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, } ret = 0; out: - set_bit(BTRFS_FS_QUOTA_DISABLING, &root->fs_info->flags); btrfs_free_path(path); return ret; } @@ -953,7 +952,6 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, if (!fs_info->quota_root) goto out; clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - set_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -2086,8 +2084,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (test_and_clear_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { -- cgit From fed3b381145e2e7c66b0b3f640851e1633ebd07f Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 13 Sep 2017 12:25:21 -0600 Subject: Btrfs: do not backup tree roots when fsync It doesn't make sense to backup tree roots when doing fsync, since during fsync those tree roots have not been consistent on disk. Signed-off-by: Liu Bo Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 27d458640536..0f2271815eb6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3641,7 +3641,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) u64 flags; do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); - backup_super_roots(fs_info); + + /* + * max_mirrors == 0 indicates we're from commit_transaction, + * not from fsync where the tree roots in fs_info have not + * been consistent on disk. + */ + if (max_mirrors == 0) + backup_super_roots(fs_info); sb = fs_info->super_for_commit; dev_item = &sb->dev_item; -- cgit From bd7d63c2ceaf737eeb21630a2b62fc5fe34dba29 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 19 Sep 2017 17:50:09 -0600 Subject: Btrfs: use btrfs_op instead of bio_op in __btrfs_map_block This seems to be a leftover of commit cf8cddd38bab ("btrfs: don't abuse REQ_OP_* flags for btrfs_map_block"). It should use btrfs_op() helper to provide one of 'enum btrfs_map_op' types. Fixes: cf8cddd38bab ("btrfs: don't abuse REQ_OP_* flags for btrfs_map_block") Signed-off-by: Liu Bo Reviewed-by: Satoru Takeuchi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d024f1b07282..6a72f88f77b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6166,7 +6166,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, bio_op(bio), logical, + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); -- cgit From cf1167d5c1abf3bc42b2a1562bfa7937c05337e2 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Sep 2017 17:50:18 -0600 Subject: Btrfs: fix kernel oops while reading compressed data The kernel oops happens at kernel BUG at fs/btrfs/extent_io.c:2104! ... RIP: clean_io_failure+0x263/0x2a0 [btrfs] It's showing that read-repair code is using an improper mirror index. This is due to the fact that compression read's endio hasn't recorded the failed mirror index in %cb->orig_bio. With this, btrfs's read-repair can work properly on reading compressed data. Signed-off-by: Liu Bo Reported-by: Paul Jones Tested-by: Paul Jones Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 883ecc58fd0d..c6aa53c4c102 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -107,6 +107,7 @@ static void end_compressed_bio_read(struct bio *bio) struct inode *inode; struct page *page; unsigned long index; + unsigned int mirror = btrfs_io_bio(bio)->mirror_num; int ret; if (bio->bi_status) @@ -118,6 +119,14 @@ static void end_compressed_bio_read(struct bio *bio) if (!refcount_dec_and_test(&cb->pending_bios)) goto out; + /* + * Record the correct mirror_num in cb->orig_bio so that + * read-repair can work properly. + */ + ASSERT(btrfs_io_bio(cb->orig_bio)); + btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + cb->mirror_num = mirror; + inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); -- cgit From e6311f240c946788131ba2b97e14f37312688072 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Sep 2017 17:50:19 -0600 Subject: Btrfs: skip checksum when reading compressed data if some IO have failed Currently even if the underlying disk reports failure on IO, compressed read endio still gets to verify checksum and reports it as a checksum error. In fact, if some IO have failed during reading a compressed data extent , there's no way the checksum could match, therefore, we can skip that in order to return error quickly to the upper layer. Please note that we need to do this after recording the failed mirror index so that read-repair in the upper layer's endio can work properly. Signed-off-by: Liu Bo Tested-by: Paul Jones Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6aa53c4c102..fc31af98a41b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -108,7 +108,7 @@ static void end_compressed_bio_read(struct bio *bio) struct page *page; unsigned long index; unsigned int mirror = btrfs_io_bio(bio)->mirror_num; - int ret; + int ret = 0; if (bio->bi_status) cb->errors = 1; @@ -127,6 +127,13 @@ static void end_compressed_bio_read(struct bio *bio) btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; + /* + * Some IO in this cb have failed, just skip checksum as there + * is no way it could be correct. + */ + if (cb->errors == 1) + goto csum_failed; + inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); -- cgit From 36b96fdc6b2dc6f4a0fedc563fa7508c91b90a10 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Sun, 17 Sep 2017 09:02:29 +0000 Subject: btrfs: Report error on removing qgroup if del_qgroup_item fails Previously, we were calling del_qgroup_item, and ignoring the return code resulting in a potential to have divergent in-memory state without an error. Perhaps, it makes sense to handle this error code, and put the filesystem into a read only, or similar state. This patch only adds reporting of the error if the error is fatal, (any error other than qgroup not found). Signed-off-by: Sargun Dhillon Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 770f667269f5..e172d4843eae 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1305,6 +1305,8 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, } } ret = del_qgroup_item(trans, quota_root, qgroupid); + if (ret && ret != -ENOENT) + goto out; while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, -- cgit From 99c4e3b96c797f047be4e6b7c03cfca01959f146 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Sep 2017 15:06:51 -0600 Subject: Btrfs: fix unexpected result when dio reading corrupted blocks commit 4246a0b63bd8 ("block: add a bi_error field to struct bio") changed the logic of how dio read endio reports errors. For single stripe dio read, %bio->bi_status reflects the error before verifying checksum, and now we're updating it when data block matches with its checksum, while in the mismatching case, %bio->bi_status is not updated to relfect that. When some blocks in a file have been corrupted on disk, reading such a file ends up with 1) checksum errors are reported in kernel log 2) read(2) returns successfully with some content being 0x01. In order to fix it, we need to report its checksum mismatch error to the upper layer (dio layer in this case) as well. Fixes: 4246a0b63bd8 ("block: add a bi_error field to struct bio") Signed-off-by: Liu Bo Reported-by: Goffredo Baroncelli Tested-by: Goffredo Baroncelli Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f78c5640c6dc..c242d0230db9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8366,11 +8366,8 @@ static void btrfs_endio_direct_read(struct bio *bio) struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); - if (!err) - bio->bi_status = 0; - } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8378,7 +8375,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_status = bio->bi_status; + dio_bio->bi_status = err; dio_end_io(dio_bio); if (io_bio->end_io) -- cgit From 8c6c592831a09a28428448e68fb08c6bbb8b9b8b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Aug 2017 10:11:39 -0400 Subject: btrfs: log csums for all modified extents Amir reported a bug discovered by his cleaned up version of my dm-log-writes xfstests where we were missing csums at certain replay points. This is because fsx was doing an msync(), which essentially fsync()'s a specific range of a file. We will log all modified extents, but only search for the checksums in the range we are being asked to sync. We cannot simply log the extents in the range we're being asked because we are logging the inode item as it is currently, which if it has had a i_size update before the msync means we will miss extents when replaying. We could possibly get around this by marking the inode with the transaction that extended the i_size to see if we have this case, but this would be racy and we'd have to lock the whole range of the inode to make sure we didn't have an ordered extent outside of our range that was in the middle of completing. Fix this simply by keeping track of the modified extents range and logging the csums for the entire range of extents that we are logging. This makes the xfstest pass. Reported-by: Amir Goldstein Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ad7f4bab640b..c800d067fcbf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4181,6 +4181,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; + u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4190,10 +4191,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; + logged_start = start; + logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); - /* * Just an arbitrary number, this can be really CPU intensive * once we start getting a lot of extents, and really once we @@ -4208,6 +4210,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + + if (em->start < logged_start) + logged_start = em->start; + if ((em->start + em->len - 1) > logged_end) + logged_end = em->start + em->len - 1; + /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -4216,7 +4224,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - btrfs_get_logged_extents(inode, logged_list, start, end); + btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); /* * Some ordered extents started by fsync might have completed * before we could collect them into the list logged_list, which -- cgit From 0bc15d85d97d44e8979ff91d0c1fbafe6fd4172c Mon Sep 17 00:00:00 2001 From: Nickey Yang Date: Tue, 26 Sep 2017 15:55:22 +0800 Subject: arm64: dts: rockchip: add the grf clk for dw-mipi-dsi on rk3399 The clk of grf must be enabled before writing grf register for rk3399. Signed-off-by: Nickey Yang [the grf clock is already part of the binding since march 2017] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 6aa43fd47148..ab7629c5b856 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1630,8 +1630,8 @@ reg = <0x0 0xff960000 0x0 0x8000>; interrupts = ; clocks = <&cru SCLK_DPHY_PLL>, <&cru PCLK_MIPI_DSI0>, - <&cru SCLK_DPHY_TX0_CFG>; - clock-names = "ref", "pclk", "phy_cfg"; + <&cru SCLK_DPHY_TX0_CFG>, <&cru PCLK_VIO_GRF>; + clock-names = "ref", "pclk", "phy_cfg", "grf"; power-domains = <&power RK3399_PD_VIO>; rockchip,grf = <&grf>; status = "disabled"; -- cgit From e88d62cd4b2f0b1ae55e9008e79c2794b1fc914d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 26 Sep 2017 12:41:52 +0100 Subject: percpu: make this_cpu_generic_read() atomic w.r.t. interrupts As raw_cpu_generic_read() is a plain read from a raw_cpu_ptr() address, it's possible (albeit unlikely) that the compiler will split the access across multiple instructions. In this_cpu_generic_read() we disable preemption but not interrupts before calling raw_cpu_generic_read(). Thus, an interrupt could be taken in the middle of the split load instructions. If a this_cpu_write() or RMW this_cpu_*() op is made to the same variable in the interrupt handling path, this_cpu_read() will return a torn value. For native word types, we can avoid tearing using READ_ONCE(), but this won't work in all cases (e.g. 64-bit types on most 32-bit platforms). This patch reworks this_cpu_generic_read() to use READ_ONCE() where possible, otherwise falling back to disabling interrupts. Signed-off-by: Mark Rutland Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Pranith Kumar Cc: Tejun Heo Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo --- include/asm-generic/percpu.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 0504ef8f3aa3..976f8ac26665 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -115,15 +115,35 @@ do { \ (__ret); \ }) -#define this_cpu_generic_read(pcp) \ +#define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ typeof(pcp) __ret; \ preempt_disable_notrace(); \ - __ret = raw_cpu_generic_read(pcp); \ + __ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ __ret; \ }) +#define __this_cpu_generic_read_noirq(pcp) \ +({ \ + typeof(pcp) __ret; \ + unsigned long __flags; \ + raw_local_irq_save(__flags); \ + __ret = raw_cpu_generic_read(pcp); \ + raw_local_irq_restore(__flags); \ + __ret; \ +}) + +#define this_cpu_generic_read(pcp) \ +({ \ + typeof(pcp) __ret; \ + if (__native_word(pcp)) \ + __ret = __this_cpu_generic_read_nopreempt(pcp); \ + else \ + __ret = __this_cpu_generic_read_noirq(pcp); \ + __ret; \ +}) + #define this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long __flags; \ -- cgit From 2d8f63297b9f0b430c96329893667c0bfdcbd47e Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 19 Sep 2017 18:38:13 +0300 Subject: drm/i915: always update ELD connector type after get modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_edid_to_eld() initializes the connector ELD to zero, overwriting the ELD connector type initialized in intel_audio_codec_enable(). If userspace does getconnector and thus get_modes after modeset, a subsequent audio component i915_audio_component_get_eld() call will receive an ELD without the connector type properly set. It's fine for HDMI, but screws up audio for DP. Always set the ELD connector type at intel_connector_update_modes() based on the connector type. We can drop the connector type update from intel_audio_codec_enable(). Credits to Joseph Nuzman for figuring this out. Cc: Ville Syrjälä Cc: Joseph Nuzman Reported-by: Joseph Nuzman Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101583 Reviewed-by: Ville Syrjälä Tested-by: Joseph Nuzman Cc: stable@vger.kernel.org # v4.10+, maybe earlier Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170919153813.29808-1-jani.nikula@intel.com (cherry picked from commit d81fb7fd9436e81fda67e5bc8ed0713aa28d3db2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_audio.c | 5 ----- drivers/gpu/drm/i915/intel_modes.c | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c index d805b6e6fe71..27743be5b768 100644 --- a/drivers/gpu/drm/i915/intel_audio.c +++ b/drivers/gpu/drm/i915/intel_audio.c @@ -606,11 +606,6 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder, connector->encoder->base.id, connector->encoder->name); - /* ELD Conn_Type */ - connector->eld[5] &= ~(3 << 2); - if (intel_crtc_has_dp_encoder(crtc_state)) - connector->eld[5] |= (1 << 2); - connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2; if (dev_priv->display.audio_codec_enable) diff --git a/drivers/gpu/drm/i915/intel_modes.c b/drivers/gpu/drm/i915/intel_modes.c index 951e834dd274..28a778b785ac 100644 --- a/drivers/gpu/drm/i915/intel_modes.c +++ b/drivers/gpu/drm/i915/intel_modes.c @@ -30,6 +30,21 @@ #include "intel_drv.h" #include "i915_drv.h" +static void intel_connector_update_eld_conn_type(struct drm_connector *connector) +{ + u8 conn_type; + + if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort || + connector->connector_type == DRM_MODE_CONNECTOR_eDP) { + conn_type = DRM_ELD_CONN_TYPE_DP; + } else { + conn_type = DRM_ELD_CONN_TYPE_HDMI; + } + + connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] &= ~DRM_ELD_CONN_TYPE_MASK; + connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= conn_type; +} + /** * intel_connector_update_modes - update connector from edid * @connector: DRM connector device to use @@ -44,6 +59,8 @@ int intel_connector_update_modes(struct drm_connector *connector, ret = drm_add_edid_modes(connector, edid); drm_edid_to_eld(connector, edid); + intel_connector_update_eld_conn_type(connector); + return ret; } -- cgit From bf5d10dcae3549b779490672c705c6ac79cf68a3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 14 Sep 2017 17:21:54 +0100 Subject: drm/i915: remove redundant variable hw_check hw_check is being assigned and updated but is no longer being read, hence it is redundant and can be removed. Detected by clang scan-build: "warning: Value stored to 'hw_check' during its initialization is never read" Fixes: f6d1973db2d2 ("drm/i915: Move modeset state verifier calls") Signed-off-by: Colin Ian King Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20170914162154.11304-1-colin.king@canonical.com (cherry picked from commit 4babc5e27cfda59e2e257d28628b8d853aea5206) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_display.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 00cd17c76fdc..64f7b51ed97c 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -12359,7 +12359,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state) struct drm_crtc_state *old_crtc_state, *new_crtc_state; struct drm_crtc *crtc; struct intel_crtc_state *intel_cstate; - bool hw_check = intel_state->modeset; u64 put_domains[I915_MAX_PIPES] = {}; unsigned crtc_vblank_mask = 0; int i; @@ -12376,7 +12375,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state) if (needs_modeset(new_crtc_state) || to_intel_crtc_state(new_crtc_state)->update_pipe) { - hw_check = true; put_domains[to_intel_crtc(crtc)->pipe] = modeset_get_crtc_power_domains(crtc, -- cgit From 2ba7d7e0437127314864238f8bfcb8369d81075c Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 21 Sep 2017 17:19:20 +0300 Subject: drm/i915/bios: ignore HDMI on port A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hardware state readout oopses after several warnings when trying to use HDMI on port A, if such a combination is configured in VBT. Filter the combo out already at the VBT parsing phase. v2: also ignore DVI (Ville) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102889 Cc: stable@vger.kernel.org Cc: Imre Deak Reviewed-by: Ville Syrjälä Tested-by: Daniel Drake Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170921141920.18172-1-jani.nikula@intel.com (cherry picked from commit d27ffc1d00327c29b3aa97f941b42f0949f9e99f) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_bios.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 183e87e8ea31..00c6aee0a9a1 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1163,6 +1163,13 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port, is_hdmi = is_dvi && (child->common.device_type & DEVICE_TYPE_NOT_HDMI_OUTPUT) == 0; is_edp = is_dp && (child->common.device_type & DEVICE_TYPE_INTERNAL_CONNECTOR); + if (port == PORT_A && is_dvi) { + DRM_DEBUG_KMS("VBT claims port A supports DVI%s, ignoring\n", + is_hdmi ? "/HDMI" : ""); + is_dvi = false; + is_hdmi = false; + } + info->supports_dvi = is_dvi; info->supports_hdmi = is_hdmi; info->supports_dp = is_dp; -- cgit From 36f6ee22d2d66046e369757ec6bbe1c482957ba6 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 26 Sep 2017 15:14:29 +0300 Subject: vti: fix use after free in vti_tunnel_xmit/vti6_tnl_xmit When running LTP IPsec tests, KASan might report: BUG: KASAN: use-after-free in vti_tunnel_xmit+0xeee/0xff0 [ip_vti] Read of size 4 at addr ffff880dc6ad1980 by task swapper/0/0 ... Call Trace: dump_stack+0x63/0x89 print_address_description+0x7c/0x290 kasan_report+0x28d/0x370 ? vti_tunnel_xmit+0xeee/0xff0 [ip_vti] __asan_report_load4_noabort+0x19/0x20 vti_tunnel_xmit+0xeee/0xff0 [ip_vti] ? vti_init_net+0x190/0x190 [ip_vti] ? save_stack_trace+0x1b/0x20 ? save_stack+0x46/0xd0 dev_hard_start_xmit+0x147/0x510 ? icmp_echo.part.24+0x1f0/0x210 __dev_queue_xmit+0x1394/0x1c60 ... Freed by task 0: save_stack_trace+0x1b/0x20 save_stack+0x46/0xd0 kasan_slab_free+0x70/0xc0 kmem_cache_free+0x81/0x1e0 kfree_skbmem+0xb1/0xe0 kfree_skb+0x75/0x170 kfree_skb_list+0x3e/0x60 __dev_queue_xmit+0x1298/0x1c60 dev_queue_xmit+0x10/0x20 neigh_resolve_output+0x3a8/0x740 ip_finish_output2+0x5c0/0xe70 ip_finish_output+0x4ba/0x680 ip_output+0x1c1/0x3a0 xfrm_output_resume+0xc65/0x13d0 xfrm_output+0x1e4/0x380 xfrm4_output_finish+0x5c/0x70 Can be fixed if we get skb->len before dst_output(). Fixes: b9959fd3b0fa ("vti: switch to new ip tunnel code") Fixes: 22e1b23dafa8 ("vti6: Support inter address family tunneling.") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 3 ++- net/ipv6/ip6_vti.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..89453cf62158 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..bcdc2d557de1 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { -- cgit From 62b982eeb4589b2e6d7c01a90590e3a4c2b2ca19 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Sep 2017 16:16:43 +0200 Subject: l2tp: fix race condition in l2tp_tunnel_delete If we try to delete the same tunnel twice, the first delete operation does a lookup (l2tp_tunnel_get), finds the tunnel, calls l2tp_tunnel_delete, which queues it for deletion by l2tp_tunnel_del_work. The second delete operation also finds the tunnel and calls l2tp_tunnel_delete. If the workqueue has already fired and started running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the same tunnel a second time, and try to free the socket again. Add a dead flag to prevent firing the workqueue twice. Then we can remove the check of queue_work's result that was meant to prevent that race but doesn't. Reproducer: ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000 ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 peer_session_id 2000 ip link set l2tp1 up ip l2tp del tunnel tunnel_id 3000 ip l2tp del tunnel tunnel_id 3000 Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue") Reported-by: Jianlin Shi Signed-off-by: Sabrina Dubroca Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 10 ++++------ net/l2tp/l2tp_core.h | 5 ++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d8c2a89a76e1..02d61101b108 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1688,14 +1688,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. */ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 70a12df40a5f..67c79d9b5c6c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -161,6 +161,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ bool acpt_newsess; /* Indicates whether this @@ -255,7 +258,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, -- cgit From 6851a3db7e224bbb85e23b3c64a506c9e0904382 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 18 Sep 2017 14:46:03 -0700 Subject: xfs: validate bdev support for DAX inode flag Currently only the blocksize is checked, but we should really be calling bdev_dax_supported() which also tests to make sure we can get a struct dax_device and that the dax_direct_access() path is working. This is the same check that we do for the "-o dax" mount option in xfs_fs_fill_super(). This does not fix the race issues that caused the XFS DAX inode option to be disabled, so that option will still be disabled. If/when we re-enable it, though, I think we will want this issue to have been fixed. I also do think that we want to fix this in stable kernels. Signed-off-by: Ross Zwisler CC: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5049e8ab6e30..aa75389be8cf 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1088,6 +1088,7 @@ xfs_ioctl_setattr_dax_invalidate( int *join_flags) { struct inode *inode = VFS_I(ip); + struct super_block *sb = inode->i_sb; int error; *join_flags = 0; @@ -1100,7 +1101,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + if (bdev_dax_supported(sb, sb->s_blocksize) < 0) return -EINVAL; } -- cgit From 546e7be8244dc050effef0555df5b8d94d10dafc Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Fri, 22 Sep 2017 11:47:33 -0700 Subject: iomap_dio_rw: Allocate AIO completion queue before submitting dio Executing xfs/104 test in a loop on Linux-v4.13 kernel on a ppc64 machine can cause the following NULL pointer dereference, .queue_work_on+0x4c/0x80 .iomap_dio_bio_end_io+0xbc/0x1f0 .bio_endio+0x118/0x1f0 .blk_update_request+0xd0/0x470 .blk_mq_end_request+0x24/0xc0 .lo_complete_rq+0x40/0xe0 .__blk_mq_complete_request_remote+0x28/0x40 .flush_smp_call_function_queue+0xc4/0x1e0 .smp_ipi_demux_relaxed+0x8c/0x100 .icp_hv_ipi_action+0x54/0xa0 .__handle_irq_event_percpu+0x84/0x2c0 .handle_irq_event_percpu+0x28/0x80 .handle_percpu_irq+0x78/0xc0 .generic_handle_irq+0x40/0x70 .__do_irq+0x88/0x200 .call_do_irq+0x14/0x24 .do_IRQ+0x84/0x130 This occurs due to the following sequence of events, 1. Allocate dio for Direct I/O write. 2. Invoke iomap_apply() until iov_iter_count() bytes have been submitted. - Assume that we have submitted atleast one bio. Hence iomap_dio->ref value will be >= 2. - If during the second iteration, iomap_apply() ends up returning -ENOSPC, we would break out of the loop and since the 'ret' value is a negative number we end up not allocating memory for super_block->s_dio_done_wq. 3. Meanwhile, iomap_dio_bio_end_io() is invoked for bios that have been submitted and here the code ends up dereferencing the NULL pointer stored at super_block->s_dio_done_wq. This commit fixes the bug by allocating memory for super_block->s_dio_done_wq before iomap_apply() is invoked. Reported-by: Eryu Guan Reviewed-by: Christoph Hellwig Tested-by: Eryu Guan Signed-off-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f32..d4f526a3f5b2 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -993,6 +993,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, WARN_ON_ONCE(ret); ret = 0; + if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + goto out_free_dio; + } + inode_dio_begin(inode); blk_start_plug(&plug); @@ -1015,13 +1022,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret < 0) iomap_dio_set_error(dio, ret); - if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && - !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - iomap_dio_set_error(dio, ret); - } - if (!atomic_dec_and_test(&dio->ref)) { if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; -- cgit From ee70daaba82d70766d0723b743d9fdeb3b06102a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 21 Sep 2017 11:26:18 -0700 Subject: xfs: update i_size after unwritten conversion in dio completion Since commit d531d91d6990 ("xfs: always use unwritten extents for direct I/O writes"), we start allocating unwritten extents for all direct writes to allow appending aio in XFS. But for dio writes that could extend file size we update the in-core inode size first, then convert the unwritten extents to real allocations at dio completion time in xfs_dio_write_end_io(). Thus a racing direct read could see the new i_size and find the unwritten extents first and read zeros instead of actual data, if the direct writer also takes a shared iolock. Fix it by updating the in-core inode size after the unwritten extent conversion. To do this, introduce a new boolean argument to xfs_iomap_write_unwritten() to tell if we want to update in-core i_size or not. Suggested-by: Brian Foster Reviewed-by: Brian Foster Signed-off-by: Eryu Guan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 3 ++- fs/xfs/xfs_file.c | 33 +++++++++++++++++++-------------- fs/xfs/xfs_iomap.c | 7 +++++-- fs/xfs/xfs_iomap.h | 2 +- fs/xfs/xfs_pnfs.c | 2 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29172609f2a3..f18e5932aec4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -343,7 +343,8 @@ xfs_end_io( error = xfs_reflink_end_cow(ip, offset, size); break; case XFS_IO_UNWRITTEN: - error = xfs_iomap_write_unwritten(ip, offset, size); + /* writeback should never update isize */ + error = xfs_iomap_write_unwritten(ip, offset, size, false); break; default: ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 350b6d43ba23..309e26c9dddb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -434,7 +434,6 @@ xfs_dio_write_end_io( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; - bool update_size = false; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); @@ -445,6 +444,21 @@ xfs_dio_write_end_io( if (size <= 0) return size; + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + /* + * Unwritten conversion updates the in-core isize after extent + * conversion but before updating the on-disk size. Updating isize any + * earlier allows a racing dio read to find unwritten extents before + * they are converted. + */ + if (flags & IOMAP_DIO_UNWRITTEN) + return xfs_iomap_write_unwritten(ip, offset, size, true); + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We @@ -459,20 +473,11 @@ xfs_dio_write_end_io( spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size); - update_size = true; - } - spin_unlock(&ip->i_flags_lock); - - if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); - if (error) - return error; - } - - if (flags & IOMAP_DIO_UNWRITTEN) - error = xfs_iomap_write_unwritten(ip, offset, size); - else if (update_size) + spin_unlock(&ip->i_flags_lock); error = xfs_setfilesize(ip, offset, size); + } else { + spin_unlock(&ip->i_flags_lock); + } return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a1909bc064e9..f179bdf1644d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -829,7 +829,8 @@ int xfs_iomap_write_unwritten( xfs_inode_t *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool update_isize) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -840,6 +841,7 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; struct xfs_defer_ops dfops; + struct inode *inode = VFS_I(ip); xfs_fsize_t i_size; uint resblks; int error; @@ -899,7 +901,8 @@ xfs_iomap_write_unwritten( i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); if (i_size > offset + count) i_size = offset + count; - + if (update_isize && i_size > i_size_read(inode)) + i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { ip->i_d.di_size = i_size; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 00db3ecea084..ee535065c5d0 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 2f2dc3c09ad0..4246876df7b7 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -274,7 +274,7 @@ xfs_fs_commit_blocks( (end - 1) >> PAGE_SHIFT); WARN_ON_ONCE(error); - error = xfs_iomap_write_unwritten(ip, start, length); + error = xfs_iomap_write_unwritten(ip, start, length, false); if (error) goto out_drop_iolock; } -- cgit From 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:42:09 -0700 Subject: xfs: perag initialization should only touch m_ag_max_usable for AG 0 We call __xfs_ag_resv_init to make a per-AG reservation for each AG. This makes the reservation per-AG, not per-filesystem. Therefore, it is incorrect to adjust m_ag_max_usable for each AG. Adjust it only when we're reserving AG 0's blocks so that we only do it once per fs. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_ag_resv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index b008ff3250eb..df3e600835e8 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -156,7 +156,8 @@ __xfs_ag_resv_free( trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag->pag_agno == 0) + pag->pag_mount->m_ag_max_usable += resv->ar_asked; /* * AGFL blocks are always considered "free", so whatever * was reserved at mount time must be given back at umount. @@ -216,7 +217,14 @@ __xfs_ag_resv_init( return error; } - mp->m_ag_max_usable -= ask; + /* + * Reduce the maximum per-AG allocation length by however much we're + * trying to reserve for an AG. Since this is a filesystem-wide + * counter, we only make the adjustment for AG 0. This assumes that + * there aren't any AGs hungrier for per-AG reservation than AG 0. + */ + if (pag->pag_agno == 0) + mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; -- cgit From 842f6e9f786226c58fcbd5ef80eadca72fdfe652 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 22 Sep 2017 11:47:46 -0700 Subject: xfs: Capture state of the right inode in xfs_iflush_done My previous patch: d3a304b6292168b83b45d624784f973fdc1ca674 check for XFS_LI_FAILED flag xfs_iflush done, so the failed item can be properly resubmitted. In the loop scanning other inodes being completed, it should check the current item for the XFS_LI_FAILED, and not the initial one. The state of the initial inode is checked after the loop ends Kudos to Eric for catching this. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6d0f74ec31e8..a705f34b58fa 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -745,7 +745,7 @@ xfs_iflush_done( */ iip = INODE_ITEM(blip); if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - lip->li_flags & XFS_LI_FAILED) + (blip->li_flags & XFS_LI_FAILED)) need_ail++; blip = next; -- cgit From 5e5c943c1f257c2b3424fc3f8a7b18570152dab3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: xfs: revert "xfs: factor rmap btree size into the indlen calculations" In commit fd26a88093ba we added a worst case estimate for rmapbt blocks needed to satisfy the block mapping request. Since then, we added the ability to reserve enough space in each AG such that we should never run out of blocks to grow the rmapbt, which makes this calculation unnecessary. Revert the commit because it makes the extra delalloc indlen accounting unnecessary and incorrect. Reported-by: Eryu Guan Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 459f4b4f08fe..044a363119be 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,7 +49,6 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_rmap_btree.h" #include "xfs_icache.h" @@ -192,12 +191,8 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ - xfs_filblks_t orig_len; mp = ip->i_mount; - - /* Calculate the worst-case size of the bmbt. */ - orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -205,20 +200,12 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) { - rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; - break; - } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } - - /* Calculate the worst-case size of the rmapbt. */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + - mp->m_rmap_maxlevels; - return rval; } -- cgit From d85fc17beeb06f9979d63fe4d9fbffbb1a00bba4 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 25 Sep 2017 10:48:47 +0300 Subject: aquantia: Setup max_mtu in ndev to enable jumbo frames Although hardware is capable for almost 16K MTU, without max_mtu field correctly set it only allows standard MTU to be used. This patch enables max MTU, calculating it from hardware maximum frame size of 16352 octets (including FCS). Fixes: 5513e16421cb ("net: ethernet: aquantia: Fixes for aq_ndev_change_mtu") Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 11 ++--------- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h | 2 +- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 6ac9e2602d6d..bf26a59a9d8e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -214,7 +214,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops, SET_NETDEV_DEV(ndev, dev); ndev->if_port = port; - ndev->min_mtu = ETH_MIN_MTU; self->ndev = ndev; self->aq_pci_func = aq_pci_func; @@ -283,6 +282,7 @@ int aq_nic_ndev_init(struct aq_nic_s *self) self->ndev->features = aq_hw_caps->hw_features; self->ndev->priv_flags = aq_hw_caps->hw_priv_flags; self->ndev->mtu = aq_nic_cfg->mtu - ETH_HLEN; + self->ndev->max_mtu = self->aq_hw_caps.mtu - ETH_FCS_LEN - ETH_HLEN; return 0; } @@ -693,16 +693,9 @@ int aq_nic_set_multicast_list(struct aq_nic_s *self, struct net_device *ndev) int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu) { - int err = 0; - - if (new_mtu > self->aq_hw_caps.mtu) { - err = -EINVAL; - goto err_exit; - } self->aq_nic_cfg.mtu = new_mtu; -err_exit: - return err; + return 0; } int aq_nic_set_mac(struct aq_nic_s *self, struct net_device *ndev) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h index f3957e930340..fcf89e25a773 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h @@ -16,7 +16,7 @@ #include "../aq_common.h" -#define HW_ATL_B0_MTU_JUMBO (16000U) +#define HW_ATL_B0_MTU_JUMBO 16352U #define HW_ATL_B0_MTU 1514U #define HW_ATL_B0_TX_RINGS 4U -- cgit From 3aec6412e007b294d4c135f5c7ed5e5ecf37dd2e Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 25 Sep 2017 10:48:48 +0300 Subject: aquantia: Fix Tx queue hangups Driver did a poor job in managing its Tx queues: Sometimes it could stop tx queues due to link down condition in aq_nic_xmit - but never waked up them. That led to Tx path total suspend. This patch fixes this and improves generic queue management: - introduces queue restart counter - uses generic netif_ interface to disable and enable tx path - refactors link up/down condition and introduces dmesg log event when link changes. - introduces new constant for minimum descriptors count required for queue wakeup Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_cfg.h | 4 ++ drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 91 +++++++++++------------- drivers/net/ethernet/aquantia/atlantic/aq_nic.h | 2 - drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 26 +++++++ drivers/net/ethernet/aquantia/atlantic/aq_ring.h | 4 ++ drivers/net/ethernet/aquantia/atlantic/aq_vec.c | 8 +-- 6 files changed, 76 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h index 214986436ece..0fdaaa643073 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h @@ -51,6 +51,10 @@ #define AQ_CFG_SKB_FRAGS_MAX 32U +/* Number of descriptors available in one ring to resume this ring queue + */ +#define AQ_CFG_RESTART_DESC_THRES (AQ_CFG_SKB_FRAGS_MAX * 2) + #define AQ_CFG_NAPI_WEIGHT 64U #define AQ_CFG_MULTICAST_ADDRESS_MAX 32U diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index bf26a59a9d8e..072a55029f04 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -119,6 +119,35 @@ int aq_nic_cfg_start(struct aq_nic_s *self) return 0; } +static int aq_nic_update_link_status(struct aq_nic_s *self) +{ + int err = self->aq_hw_ops.hw_get_link_status(self->aq_hw); + + if (err) + return err; + + if (self->link_status.mbps != self->aq_hw->aq_link_status.mbps) + pr_info("%s: link change old %d new %d\n", + AQ_CFG_DRV_NAME, self->link_status.mbps, + self->aq_hw->aq_link_status.mbps); + + self->link_status = self->aq_hw->aq_link_status; + if (!netif_carrier_ok(self->ndev) && self->link_status.mbps) { + aq_utils_obj_set(&self->header.flags, + AQ_NIC_FLAG_STARTED); + aq_utils_obj_clear(&self->header.flags, + AQ_NIC_LINK_DOWN); + netif_carrier_on(self->ndev); + netif_tx_wake_all_queues(self->ndev); + } + if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) { + netif_carrier_off(self->ndev); + netif_tx_disable(self->ndev); + aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN); + } + return 0; +} + static void aq_nic_service_timer_cb(unsigned long param) { struct aq_nic_s *self = (struct aq_nic_s *)param; @@ -131,26 +160,13 @@ static void aq_nic_service_timer_cb(unsigned long param) if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY)) goto err_exit; - err = self->aq_hw_ops.hw_get_link_status(self->aq_hw); - if (err < 0) + err = aq_nic_update_link_status(self); + if (err) goto err_exit; - self->link_status = self->aq_hw->aq_link_status; - self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw, self->aq_nic_cfg.is_interrupt_moderation); - if (self->link_status.mbps) { - aq_utils_obj_set(&self->header.flags, - AQ_NIC_FLAG_STARTED); - aq_utils_obj_clear(&self->header.flags, - AQ_NIC_LINK_DOWN); - netif_carrier_on(self->ndev); - } else { - netif_carrier_off(self->ndev); - aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN); - } - memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s)); memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s)); for (i = AQ_DIMOF(self->aq_vec); i--;) { @@ -240,7 +256,6 @@ err_exit: int aq_nic_ndev_register(struct aq_nic_s *self) { int err = 0; - unsigned int i = 0U; if (!self->ndev) { err = -EINVAL; @@ -262,8 +277,7 @@ int aq_nic_ndev_register(struct aq_nic_s *self) netif_carrier_off(self->ndev); - for (i = AQ_CFG_VECS_MAX; i--;) - aq_nic_ndev_queue_stop(self, i); + netif_tx_disable(self->ndev); err = register_netdev(self->ndev); if (err < 0) @@ -318,12 +332,8 @@ struct aq_nic_s *aq_nic_alloc_hot(struct net_device *ndev) err = -EINVAL; goto err_exit; } - if (netif_running(ndev)) { - unsigned int i; - - for (i = AQ_CFG_VECS_MAX; i--;) - netif_stop_subqueue(ndev, i); - } + if (netif_running(ndev)) + netif_tx_disable(ndev); for (self->aq_vecs = 0; self->aq_vecs < self->aq_nic_cfg.vecs; self->aq_vecs++) { @@ -383,16 +393,6 @@ err_exit: return err; } -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx) -{ - netif_start_subqueue(self->ndev, idx); -} - -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx) -{ - netif_stop_subqueue(self->ndev, idx); -} - int aq_nic_start(struct aq_nic_s *self) { struct aq_vec_s *aq_vec = NULL; @@ -451,10 +451,6 @@ int aq_nic_start(struct aq_nic_s *self) goto err_exit; } - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) - aq_nic_ndev_queue_start(self, i); - err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs); if (err < 0) goto err_exit; @@ -463,6 +459,8 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; + netif_tx_start_all_queues(self->ndev); + err_exit: return err; } @@ -602,7 +600,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs; unsigned int tc = 0U; int err = NETDEV_TX_OK; - bool is_nic_in_bad_state; frags = skb_shinfo(skb)->nr_frags + 1; @@ -613,13 +610,10 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) goto err_exit; } - is_nic_in_bad_state = aq_utils_obj_test(&self->header.flags, - AQ_NIC_FLAGS_IS_NOT_TX_READY) || - (aq_ring_avail_dx(ring) < - AQ_CFG_SKB_FRAGS_MAX); + aq_ring_update_queue_state(ring); - if (is_nic_in_bad_state) { - aq_nic_ndev_queue_stop(self, ring->idx); + /* Above status update may stop the queue. Check this. */ + if (__netif_subqueue_stopped(self->ndev, ring->idx)) { err = NETDEV_TX_BUSY; goto err_exit; } @@ -631,9 +625,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) ring, frags); if (err >= 0) { - if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1) - aq_nic_ndev_queue_stop(self, ring->idx); - ++ring->stats.tx.packets; ring->stats.tx.bytes += skb->len; } @@ -898,9 +889,7 @@ int aq_nic_stop(struct aq_nic_s *self) struct aq_vec_s *aq_vec = NULL; unsigned int i = 0U; - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) - aq_nic_ndev_queue_stop(self, i); + netif_tx_disable(self->ndev); del_timer_sync(&self->service_timer); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h index 7fc2a5ecb2b7..0ddd556ff901 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h @@ -83,8 +83,6 @@ struct net_device *aq_nic_get_ndev(struct aq_nic_s *self); int aq_nic_init(struct aq_nic_s *self); int aq_nic_cfg_start(struct aq_nic_s *self); int aq_nic_ndev_register(struct aq_nic_s *self); -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx); -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx); void aq_nic_ndev_free(struct aq_nic_s *self); int aq_nic_start(struct aq_nic_s *self); int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 4eee1996a825..02f79b0640ba 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -104,6 +104,32 @@ int aq_ring_init(struct aq_ring_s *self) return 0; } +void aq_ring_update_queue_state(struct aq_ring_s *ring) +{ + if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX) + aq_ring_queue_stop(ring); + else if (aq_ring_avail_dx(ring) > AQ_CFG_RESTART_DESC_THRES) + aq_ring_queue_wake(ring); +} + +void aq_ring_queue_wake(struct aq_ring_s *ring) +{ + struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic); + + if (__netif_subqueue_stopped(ndev, ring->idx)) { + netif_wake_subqueue(ndev, ring->idx); + ring->stats.tx.queue_restarts++; + } +} + +void aq_ring_queue_stop(struct aq_ring_s *ring) +{ + struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic); + + if (!__netif_subqueue_stopped(ndev, ring->idx)) + netif_stop_subqueue(ndev, ring->idx); +} + void aq_ring_tx_clean(struct aq_ring_s *self) { struct device *dev = aq_nic_get_dev(self->aq_nic); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h index 782176c5f4f8..24523b5ac68c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h @@ -94,6 +94,7 @@ struct aq_ring_stats_tx_s { u64 errors; u64 packets; u64 bytes; + u64 queue_restarts; }; union aq_ring_stats_s { @@ -147,6 +148,9 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self, int aq_ring_init(struct aq_ring_s *self); void aq_ring_rx_deinit(struct aq_ring_s *self); void aq_ring_free(struct aq_ring_s *self); +void aq_ring_update_queue_state(struct aq_ring_s *ring); +void aq_ring_queue_wake(struct aq_ring_s *ring); +void aq_ring_queue_stop(struct aq_ring_s *ring); void aq_ring_tx_clean(struct aq_ring_s *self); int aq_ring_rx_clean(struct aq_ring_s *self, struct napi_struct *napi, diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index ebf588004c46..305ff8ffac2c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -59,12 +59,7 @@ static int aq_vec_poll(struct napi_struct *napi, int budget) if (ring[AQ_VEC_TX_ID].sw_head != ring[AQ_VEC_TX_ID].hw_head) { aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]); - - if (aq_ring_avail_dx(&ring[AQ_VEC_TX_ID]) > - AQ_CFG_SKB_FRAGS_MAX) { - aq_nic_ndev_queue_start(self->aq_nic, - ring[AQ_VEC_TX_ID].idx); - } + aq_ring_update_queue_state(&ring[AQ_VEC_TX_ID]); was_tx_cleaned = true; } @@ -364,6 +359,7 @@ void aq_vec_add_stats(struct aq_vec_s *self, stats_tx->packets += tx->packets; stats_tx->bytes += tx->bytes; stats_tx->errors += tx->errors; + stats_tx->queue_restarts += tx->queue_restarts; } } -- cgit From a7bb1bea3a296549ebfc28afa76276ef392f9afa Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 25 Sep 2017 10:48:49 +0300 Subject: aquantia: Fix transient invalid link down/up indications Due to a bug in aquantia atlantic card firmware, it sometimes reports invalid link speed bits. That caused driver to report link down events, although link itself is totally fine. This patch ignores such out of blue readings. Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 4f5ec9a0fbfb..bf734b32e44b 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -351,8 +351,7 @@ int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self) break; default: - link_status->mbps = 0U; - break; + return -EBUSY; } } -- cgit From c7545689244b50c562b1fbbc71905fba224c8a05 Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Mon, 25 Sep 2017 10:48:50 +0300 Subject: atlantic: fix iommu errors Call skb_frag_dma_map multiple times if tx length is greater than device max and avoid processing tx ring until entire packet has been sent. Signed-off-by: Igor Russkikh Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 43 ++++++++++++++---------- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 27 ++++++++++----- drivers/net/ethernet/aquantia/atlantic/aq_ring.h | 6 ++-- 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 072a55029f04..0a5bb4114eb4 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -473,6 +473,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, unsigned int nr_frags = skb_shinfo(skb)->nr_frags; unsigned int frag_count = 0U; unsigned int dx = ring->sw_tail; + struct aq_ring_buff_s *first = NULL; struct aq_ring_buff_s *dx_buff = &ring->buff_ring[dx]; if (unlikely(skb_is_gso(skb))) { @@ -483,6 +484,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, dx_buff->len_l4 = tcp_hdrlen(skb); dx_buff->mss = skb_shinfo(skb)->gso_size; dx_buff->is_txc = 1U; + dx_buff->eop_index = 0xffffU; dx_buff->is_ipv6 = (ip_hdr(skb)->version == 6) ? 1U : 0U; @@ -502,6 +504,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) goto exit; + first = dx_buff; dx_buff->len_pkt = skb->len; dx_buff->is_sop = 1U; dx_buff->is_mapped = 1U; @@ -530,40 +533,46 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, for (; nr_frags--; ++frag_count) { unsigned int frag_len = 0U; + unsigned int buff_offset = 0U; + unsigned int buff_size = 0U; dma_addr_t frag_pa; skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_count]; frag_len = skb_frag_size(frag); - frag_pa = skb_frag_dma_map(aq_nic_get_dev(self), frag, 0, - frag_len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(aq_nic_get_dev(self), frag_pa))) - goto mapping_error; + while (frag_len) { + if (frag_len > AQ_CFG_TX_FRAME_MAX) + buff_size = AQ_CFG_TX_FRAME_MAX; + else + buff_size = frag_len; + + frag_pa = skb_frag_dma_map(aq_nic_get_dev(self), + frag, + buff_offset, + buff_size, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(aq_nic_get_dev(self), + frag_pa))) + goto mapping_error; - while (frag_len > AQ_CFG_TX_FRAME_MAX) { dx = aq_ring_next_dx(ring, dx); dx_buff = &ring->buff_ring[dx]; dx_buff->flags = 0U; - dx_buff->len = AQ_CFG_TX_FRAME_MAX; + dx_buff->len = buff_size; dx_buff->pa = frag_pa; dx_buff->is_mapped = 1U; + dx_buff->eop_index = 0xffffU; + + frag_len -= buff_size; + buff_offset += buff_size; - frag_len -= AQ_CFG_TX_FRAME_MAX; - frag_pa += AQ_CFG_TX_FRAME_MAX; ++ret; } - - dx = aq_ring_next_dx(ring, dx); - dx_buff = &ring->buff_ring[dx]; - - dx_buff->flags = 0U; - dx_buff->len = frag_len; - dx_buff->pa = frag_pa; - dx_buff->is_mapped = 1U; - ++ret; } + first->eop_index = dx; dx_buff->is_eop = 1U; dx_buff->skb = skb; goto exit; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 02f79b0640ba..0654e0c76bc2 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -104,6 +104,12 @@ int aq_ring_init(struct aq_ring_s *self) return 0; } +static inline bool aq_ring_dx_in_range(unsigned int h, unsigned int i, + unsigned int t) +{ + return (h < t) ? ((h < i) && (i < t)) : ((h < i) || (i < t)); +} + void aq_ring_update_queue_state(struct aq_ring_s *ring) { if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX) @@ -139,23 +145,28 @@ void aq_ring_tx_clean(struct aq_ring_s *self) struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head]; if (likely(buff->is_mapped)) { - if (unlikely(buff->is_sop)) + if (unlikely(buff->is_sop)) { + if (!buff->is_eop && + buff->eop_index != 0xffffU && + (!aq_ring_dx_in_range(self->sw_head, + buff->eop_index, + self->hw_head))) + break; + dma_unmap_single(dev, buff->pa, buff->len, DMA_TO_DEVICE); - else + } else { dma_unmap_page(dev, buff->pa, buff->len, DMA_TO_DEVICE); + } } if (unlikely(buff->is_eop)) dev_kfree_skb_any(buff->skb); - } -} -static inline unsigned int aq_ring_dx_in_range(unsigned int h, unsigned int i, - unsigned int t) -{ - return (h < t) ? ((h < i) && (i < t)) : ((h < i) || (i < t)); + buff->pa = 0U; + buff->eop_index = 0xffffU; + } } #define AQ_SKB_ALIGN SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h index 24523b5ac68c..5844078764bd 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h @@ -65,7 +65,7 @@ struct __packed aq_ring_buff_s { }; union { struct { - u32 len:16; + u16 len; u32 is_ip_cso:1; u32 is_udp_cso:1; u32 is_tcp_cso:1; @@ -77,8 +77,10 @@ struct __packed aq_ring_buff_s { u32 is_cleaned:1; u32 is_error:1; u32 rsvd3:6; + u16 eop_index; + u16 rsvd4; }; - u32 flags; + u64 flags; }; }; -- cgit From fc46820b27a2d9a46f7e90c9ceb4a64a1bc5fab8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 25 Sep 2017 12:23:03 +0200 Subject: vfs: Return -ENXIO for negative SEEK_HOLE / SEEK_DATA offsets In generic_file_llseek_size, return -ENXIO for negative offsets as well as offsets beyond EOF. This affects filesystems which don't implement SEEK_HOLE / SEEK_DATA internally, possibly because they don't support holes. Fixes xfstest generic/448. Signed-off-by: Andreas Gruenbacher Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/read_write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index a2b9a47235c5..f0d4b16873e8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -112,7 +112,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -120,7 +120,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; -- cgit From c2cc187e53011c1c4931055984657da9085c763b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 25 Sep 2017 13:19:26 +0300 Subject: sctp: Fix a big endian bug in sctp_diag_dump() The sctp_for_each_transport() function takes an pointer to int. The cb->args[] array holds longs so it's only using the high 32 bits. It works on little endian system but will break on big endian 64 bit machines. Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") Signed-off-by: Dan Carpenter Acked-by: Neil Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sctp_diag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 22ed01a76b19..a72a7d925d46 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; + int pos = cb->args[2]; /* eps hashtable dumps * args: @@ -493,7 +494,8 @@ skip: goto done; sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, (int *)&cb->args[2], &commp); + net, &pos, &commp); + cb->args[2] = pos; done: cb->args[1] = cb->args[4]; -- cgit From ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 18 Sep 2017 23:00:59 +0300 Subject: platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is not presnt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My Fujitsu-Siemens Lifebook S6120 doesn't have the FUJ02E3 device, but it does have FUJ02B1. That means we do register the backlight device (and it even seems to work), but the code will oops as soon as we try to set the backlight brightness because it's trying to call call_fext_func() with a NULL device. Let's just skip those function calls when the FUJ02E3 device is not present. Cc: Jonathan Woithe Cc: Andy Shevchenko Signed-off-by: Ville Syrjälä Cc: # 4.13.x Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/fujitsu-laptop.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 85de30f93a9c..56a8195096a2 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -254,10 +254,12 @@ static int bl_update_status(struct backlight_device *b) { struct acpi_device *device = bl_get_data(b); - if (b->props.power == FB_BLANK_POWERDOWN) - call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); - else - call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); + if (fext) { + if (b->props.power == FB_BLANK_POWERDOWN) + call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); + else + call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); + } return set_lcd_level(device, b->props.brightness); } -- cgit From 4c6bb69663b3a3f2db8f488356e96acb5460f25f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 26 Sep 2017 10:36:05 +0200 Subject: quota: Fix quota corruption with generic/232 test Eric has reported that since commit d2faa415166b "quota: Do not acquire dqio_sem for dquot overwrites in v2 format" test generic/232 occasionally fails due to quota information being incorrect. Indeed that commit was too eager to remove dqio_sem completely from the path that just overwrites quota structure with updated information. Although that is innocent on its own, another process that inserts new quota structure to the same block can perform read-modify-write cycle of that block thus effectively discarding quota information update if they race in a wrong way. Fix the problem by acquiring dqio_sem for reading for overwrites of quota structure. Note that it *is* possible to completely avoid taking dqio_sem in the overwrite path however that will require modifying path inserting / deleting quota structures to avoid RMW cycles of the full block and for now it is not clear whether it is worth the hassle. Fixes: d2faa415166b2883428efa92f451774ef44373ac Reported-and-tested-by: Eric Whitney Signed-off-by: Jan Kara --- fs/quota/quota_v2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index c0187cda2c1e..a73e5b34db41 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -328,12 +328,16 @@ static int v2_write_dquot(struct dquot *dquot) if (!dquot->dq_off) { alloc = true; down_write(&dqopt->dqio_sem); + } else { + down_read(&dqopt->dqio_sem); } ret = qtree_write_dquot( sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); if (alloc) up_write(&dqopt->dqio_sem); + else + up_read(&dqopt->dqio_sem); return ret; } -- cgit From 5371513fb338fb9989c569dc071326d369d6ade8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Sep 2017 15:57:16 +0100 Subject: arm64: Make sure SPsel is always set When the kernel is entered at EL2 on an ARMv8.0 system, we construct the EL1 pstate and make sure this uses the the EL1 stack pointer (we perform an exception return to EL1h). But if the kernel is either entered at EL1 or stays at EL2 (because we're on a VHE-capable system), we fail to set SPsel, and use whatever stack selection the higher exception level has choosen for us. Let's not take any chance, and make sure that SPsel is set to one before we decide the mode we're going to run in. Cc: Acked-by: Mark Rutland Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7434ec0c7a27..0b243ecaf7ac 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -384,6 +384,7 @@ ENTRY(kimage_vaddr) * booted in EL1 or EL2 respectively. */ ENTRY(el2_setup) + msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq 1f -- cgit From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:04 +0200 Subject: KVM: VMX: extract __pi_post_block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple code movement patch, preparing for the next one. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c83d28b0ab05..0002b14307ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11705,6 +11705,43 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. @@ -11798,44 +11835,12 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || !kvm_vcpu_apicv_active(vcpu)) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + __pi_post_block(vcpu); } static void vmx_post_block(struct kvm_vcpu *vcpu) -- cgit From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:05 +0200 Subject: KVM: VMX: avoid double list add with VT-d posted interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases, for example involving hot-unplug of assigned devices, pi_post_block can forget to remove the vCPU from the blocked_vcpu_list. When this happens, the next call to pi_pre_block corrupts the list. Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block and WARN instead of adding the element twice in the list. Second, always do the list removal in pi_post_block if vcpu->pre_pcpu is set (not -1). The new code keeps interrupts disabled for the whole duration of pi_pre_block/pi_post_block. This is not strictly necessary, but easier to follow. For the same reason, PI.ON is checked only after the cmpxchg, and to handle it we just call the post-block code. This removes duplication of the list removal code. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0002b14307ab..0bfe97e50a40 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11710,10 +11710,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; unsigned int dest; - unsigned long flags; do { old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); dest = cpu_physical_id(vcpu->cpu); @@ -11730,14 +11731,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); vcpu->pre_pcpu = -1; } } @@ -11757,7 +11754,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11767,34 +11763,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11819,7 +11801,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11835,12 +11822,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; + WARN_ON(irqs_disabled()); + local_irq_disable(); __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) -- cgit From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:06 +0200 Subject: KVM: VMX: simplify and fix vmx_vcpu_pi_load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The simplify part: do not touch pi_desc.nv, we can set it when the VCPU is first created. Likewise, pi_desc.sn is only handled by vmx_vcpu_pi_load, do not touch it in __pi_post_block. The fix part: do not check kvm_arch_has_assigned_device, instead check the SN bit to figure out whether vmx_vcpu_pi_put ran before. This matches what the previous patch did in pi_post_block. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0bfe97e50a40..b9d2140eb212 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2202,43 +2202,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); return; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + dest = cpu_physical_id(cpu); - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ new.sn = 0; } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); @@ -9592,6 +9590,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -11723,9 +11728,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) else new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ - new.sn = 0; - /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; } while (cmpxchg(&pi_desc->control, old.control, -- cgit From 7e439681af82984045efc215437ebb2ca8d33a4c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 25 Sep 2017 10:19:57 +0200 Subject: mtd: Fix partition alignment check on multi-erasesize devices Commit 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize") introduced a regression on heterogeneous erase region devices. Alignment of the partition was tested against the master eraseblock size which can be bigger than the slave one, thus leading to some partitions being marked as read-only. Update wr_alignment to match this slave erasesize after this erasesize has been determined by picking the biggest erasesize of all the regions embedded in the MTD partition. Reported-by: Mathias Thore Fixes: 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize") Cc: Signed-off-by: Boris Brezillon Tested-by: Mathias Thore Reviewed-by: Mathias Thore --- drivers/mtd/mtdpart.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 5736b0c90b33..a308e707392d 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -581,6 +581,14 @@ static struct mtd_part *allocate_partition(struct mtd_info *parent, slave->mtd.erasesize = parent->erasesize; } + /* + * Slave erasesize might differ from the master one if the master + * exposes several regions with different erasesize. Adjust + * wr_alignment accordingly. + */ + if (!(slave->mtd.flags & MTD_NO_ERASE)) + wr_alignment = slave->mtd.erasesize; + tmp = slave->offset; remainder = do_div(tmp, wr_alignment); if ((slave->mtd.flags & MTD_WRITEABLE) && remainder) { -- cgit From 5c62c1c67903621cfa715d6f690548ee53301620 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 25 Sep 2017 17:28:47 +0800 Subject: iommu/io-pgtable-arm-v7s: Need dma-sync while there is no QUIRK_NO_DMA Fix the commit 81b3c2521844 ("iommu/io-pgtable: Introduce explicit coherency"). If there is no IO_PGTABLE_QUIRK_NO_DMA, we should call dma_sync_single_for_device for cache synchronization. Signed-off-by: Yong Wu Fixes: 81b3c2521844 ('iommu/io-pgtable: Introduce explicit coherency') Reviewed-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index d665d0dc16e8..6961fc393f0b 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -245,7 +245,7 @@ static void __arm_v7s_free_table(void *table, int lvl, static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries, struct io_pgtable_cfg *cfg) { - if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) + if (cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) return; dma_sync_single_for_device(cfg->iommu_dev, __arm_v7s_dma_addr(ptep), -- cgit From 1ff9b17cedb39bc78f9e3f82485765f9b467177d Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 25 Sep 2017 18:15:26 +0800 Subject: iommu/mediatek: Limit the physical address in 32bit for v7s The ARM short descriptor has already limited the physical address to 32bit after the commit <76557391433c> ("iommu/io-pgtable: Sanitise map/unmap addresses"). But in MediaTek 4GB mode, the physical address is from 0x1_0000_0000 to 0x1_ffff_ffff. this will cause: WARNING: CPU: 4 PID: 3900 at xxx/drivers/iommu/io-pgtable-arm-v7s.c:482 arm_v7s_map+0x40/0xf8 Modules linked in: CPU: 4 PID: 3900 Comm: weston Tainted: G S W 4.9.44 #1 Hardware name: MediaTek MT2712m1v1 board (DT) task: ffffffc0eaa5b280 task.stack: ffffffc0e9858000 PC is at arm_v7s_map+0x40/0xf8 LR is at mtk_iommu_map+0x64/0x90 pc : [] lr : [] pstate: 000001c5 sp : ffffffc0e985b920 x29: ffffffc0e985b920 x28: 0000000127d00000 x27: 0000000000100000 x26: ffffff8008f9e000 x25: 0000000000000003 x24: 0000000000100000 x23: 0000000127d00000 x22: 00000000ff800000 x21: ffffffc0f7ec8ce0 x20: 0000000000000003 x19: 0000000000000003 x18: 0000000000000002 x17: 0000007f7e5d72c0 x16: ffffff80082b0f08 x15: 0000000000000001 x14: 000000000000003f x13: 0000000000000000 x12: 0000000000000028 x11: 0088000000000000 x10: 0000000000000000 x9 : ffffff80092fa000 x8 : ffffffc0e9858000 x7 : ffffff80085b29d8 x6 : 0000000000000000 x5 : ffffff80085b09a8 x4 : 0000000000000003 x3 : 0000000000100000 x2 : 0000000127d00000 x1 : 00000000ff800000 x0 : 0000000000000001 ... Call trace: [] arm_v7s_map+0x40/0xf8 [] mtk_iommu_map+0x64/0x90 [] iommu_map+0x100/0x3a0 [] default_iommu_map_sg+0x104/0x168 [] iommu_dma_alloc+0x238/0x3f8 [] __iommu_alloc_attrs+0xa8/0x260 [] mtk_drm_gem_create+0xac/0x180 [] mtk_drm_gem_dumb_create+0x54/0xc8 [] drm_mode_create_dumb_ioctl+0xa4/0xd8 [] drm_ioctl+0x1c0/0x490 In order to satify this, Limit the physical address to 32bit. Signed-off-by: Yong Wu Acked-by: Will Deacon Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index bd515be5b380..16d33ac19db0 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -371,7 +371,8 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, int ret; spin_lock_irqsave(&dom->pgtlock, flags); - ret = dom->iop->map(dom->iop, iova, paddr, size, prot); + ret = dom->iop->map(dom->iop, iova, paddr & DMA_BIT_MASK(32), + size, prot); spin_unlock_irqrestore(&dom->pgtlock, flags); return ret; -- cgit From 3c6bae62136ba5b24f0b113e68121b783457ca4b Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 26 Sep 2017 13:07:46 +0530 Subject: iommu/amd: pr_err() strings should end with newlines pr_err() messages should end with a new-line to avoid other messages being concatenated. So replace '/n' with '\n'. Signed-off-by: Arvind Yadav Fixes: 45a01c42933b ('iommu/amd: Add function copy_dev_tables()') Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 382de42b8359..6fe2d0346073 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -874,7 +874,7 @@ static bool copy_device_table(void) hi = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET + 4); entry = (((u64) hi) << 32) + lo; if (last_entry && last_entry != entry) { - pr_err("IOMMU:%d should use the same dev table as others!/n", + pr_err("IOMMU:%d should use the same dev table as others!\n", iommu->index); return false; } @@ -882,7 +882,7 @@ static bool copy_device_table(void) old_devtb_size = ((entry & ~PAGE_MASK) + 1) << 12; if (old_devtb_size != dev_table_size) { - pr_err("The device table size of IOMMU:%d is not expected!/n", + pr_err("The device table size of IOMMU:%d is not expected!\n", iommu->index); return false; } @@ -890,7 +890,7 @@ static bool copy_device_table(void) old_devtb_phys = entry & PAGE_MASK; if (old_devtb_phys >= 0x100000000ULL) { - pr_err("The address of old device table is above 4G, not trustworthy!/n"); + pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); @@ -901,7 +901,7 @@ static bool copy_device_table(void) old_dev_tbl_cpy = (void *)__get_free_pages(gfp_flag, get_order(dev_table_size)); if (old_dev_tbl_cpy == NULL) { - pr_err("Failed to allocate memory for copying old device table!/n"); + pr_err("Failed to allocate memory for copying old device table!\n"); return false; } -- cgit From 50ce6312f293e129eedf2affc7bd791c71d8287e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 26 Sep 2017 19:32:52 +0100 Subject: iommu: Fix comment for iommu_ops.map_sg The definition of map_sg was split during a recent addition to iommu_ops. Put it back together. Fixes: add02cfdc9bc ("iommu: Introduce Interface for IOMMU TLB Flushing") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a7f2ac689d29..41b8c5757859 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -167,11 +167,11 @@ struct iommu_resv_region { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @map_sg: map a scatter-gather list of physically contiguous memory chunks + * to an iommu domain * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain * @tlb_range_add: Add a given iova range to the flush queue for this domain * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue - * to an iommu domain * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping -- cgit From df5efdd97029f2cff7e5c91ea1c9f2b94d009b0f Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:05:57 -0700 Subject: IB/hfi1: Turn off AOC TX after offline substates Offline.quietDuration was added in the 8051 firmware, and the driver only turns off the AOC transmitters when offline.quiet is reached. However, the AOC transmitters need to be turned off at the new state. Therefore, turn off the AOC transmitters at any offline substates including offline.quiet and offline.quietDuration, then recheck we reached offline.quiet to support backwards compatibility. Reviewed-by: Jakub Byczkowski Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 85 +++++++++++++++++++++++++++++---------- drivers/infiniband/hw/hfi1/chip.h | 1 + 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index b2ed4b9cda6e..1c810d65721a 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1066,6 +1066,8 @@ static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); static int thermal_init(struct hfi1_devdata *dd); static void update_statusp(struct hfi1_pportdata *ppd, u32 state); +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void log_state_transition(struct hfi1_pportdata *ppd, u32 state); @@ -10305,6 +10307,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) { struct hfi1_devdata *dd = ppd->dd; u32 previous_state; + int offline_state_ret; int ret; update_lcb_cache(dd); @@ -10326,28 +10329,11 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); - /* - * Wait for offline transition. It can take a while for - * the link to go down. - */ - ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 10000); - if (ret < 0) - return ret; - - /* - * Now in charge of LCB - must be after the physical state is - * offline.quiet and before host_link_state is changed. - */ - set_host_lcb_access(dd); - write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ - - /* make sure the logical state is also down */ - ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); - if (ret) - force_logical_link_state_down(ppd); - - ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + offline_state_ret = wait_phys_link_offline_substates(ppd, 10000); + if (offline_state_ret < 0) + return offline_state_ret; + /* Disabling AOC transmitters */ if (ppd->port_type == PORT_TYPE_QSFP && ppd->qsfp_info.limiting_active && qsfp_mod_present(ppd)) { @@ -10364,6 +10350,30 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) } } + /* + * Wait for the offline.Quiet transition if it hasn't happened yet. It + * can take a while for the link to go down. + */ + if (offline_state_ret != PLS_OFFLINE_QUIET) { + ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 30000); + if (ret < 0) + return ret; + } + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + /* make sure the logical state is also down */ + ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + if (ret) + force_logical_link_state_down(ppd); + + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + /* * The LNI has a mandatory wait time after the physical state * moves to Offline.Quiet. The wait time may be different @@ -12804,6 +12814,39 @@ static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, return 0; } +/* + * wait_phys_link_offline_quiet_substates - wait for any offline substate + * @ppd: port device + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for any offline physical link + * state change to occur. + * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT. + */ +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if ((read_state & 0xF0) == PLS_OFFLINE) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link offline.quiet substates. Read state 0x%x, %dms\n", + read_state, msecs); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, read_state); + return read_state; +} + #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index b8345a60a0fb..461f937fe110 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -204,6 +204,7 @@ #define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 #define PLS_OFFLINE_REPORT_FAILURE 0x93 #define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_OFFLINE_QUIET_DURATION 0x95 #define PLS_POLLING 0x20 #define PLS_POLLING_QUIET 0x20 #define PLS_POLLING_ACTIVE 0x21 -- cgit From 30e10527bcce376114e627abb7fabfbe9bfee91e Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:06:03 -0700 Subject: IB/hfi1: Only reset QSFP after link up and turn off AOC TX QSFP reset enables AOC transmitters by default. They should be off before moving to high power mode to complete the setup. There is no need to reset the QSFP during LNI failure as it was reset at link down. Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 12 +++++++++++- drivers/infiniband/hw/hfi1/chip.h | 2 +- drivers/infiniband/hw/hfi1/platform.c | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 1c810d65721a..27b75a8f5097 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -9415,7 +9415,7 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); } -void reset_qsfp(struct hfi1_pportdata *ppd) +int reset_qsfp(struct hfi1_pportdata *ppd) { struct hfi1_devdata *dd = ppd->dd; u64 mask, qsfp_mask; @@ -9445,6 +9445,13 @@ void reset_qsfp(struct hfi1_pportdata *ppd) * for alarms and warnings */ set_qsfp_int_n(ppd, 1); + + /* + * After the reset, AOC transmitters are enabled by default. They need + * to be turned off to complete the QSFP setup before they can be + * enabled again. + */ + return set_qsfp_tx(ppd, 0); } static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, @@ -10406,6 +10413,9 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { /* went down while attempting link up */ check_lni_states(ppd); + + /* The QSFP doesn't need to be reset on LNI failure */ + ppd->qsfp_info.reset_needed = 0; } /* the active link width (downgrade) is 0 on link down */ diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 461f937fe110..50b8645d0b87 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -723,7 +723,7 @@ void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_start_link(struct work_struct *work); void handle_sma_message(struct work_struct *work); -void reset_qsfp(struct hfi1_pportdata *ppd); +int reset_qsfp(struct hfi1_pportdata *ppd); void qsfp_event(struct work_struct *work); void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); int send_idle_sma(struct hfi1_devdata *dd, u64 message); diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index a8af96d2b1b0..d486355880cb 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -790,7 +790,9 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, * reuse of stale settings established in our previous pass through. */ if (ppd->qsfp_info.reset_needed) { - reset_qsfp(ppd); + ret = reset_qsfp(ppd); + if (ret) + return ret; refresh_qsfp_cache(ppd, &ppd->qsfp_info); } else { ppd->qsfp_info.reset_needed = 1; -- cgit From 753b19afb19dd97d0767df5e8afb13faff605315 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Tue, 26 Sep 2017 06:06:09 -0700 Subject: IB/hfi1: Check eeprom config partition validity Relying on a trailing magic value is incorrect. There are instances where this is not present as trailing magic value has a specific purpose which is not partition validation. Instead use the header magic value which is present in all variants of the platform configuration and is intended for validation. This is also used in other locations in the driver. Fixes: bc5214ee2922 (IB/hfi1: Handle missing magic values in config file) Reviewed-by: Jakub Byczkowski Signed-off-by: Jan Sokolowski Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/eprom.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c index d46b17107901..1613af1c58d9 100644 --- a/drivers/infiniband/hw/hfi1/eprom.c +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -204,7 +204,10 @@ done_asic: return ret; } -/* magic character sequence that trails an image */ +/* magic character sequence that begins an image */ +#define IMAGE_START_MAGIC "APO=" + +/* magic character sequence that might trail an image */ #define IMAGE_TRAIL_MAGIC "egamiAPO" /* EPROM file types */ @@ -250,6 +253,7 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, { void *buffer; void *p; + u32 length; int ret; buffer = kmalloc(P1_SIZE, GFP_KERNEL); @@ -262,15 +266,21 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, return ret; } - /* scan for image magic that may trail the actual data */ - p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); - if (!p) { + /* config partition is valid only if it starts with IMAGE_START_MAGIC */ + if (memcmp(buffer, IMAGE_START_MAGIC, strlen(IMAGE_START_MAGIC))) { kfree(buffer); return -ENOENT; } + /* scan for image magic that may trail the actual data */ + p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); + if (p) + length = p - buffer; + else + length = P1_SIZE; + *data = buffer; - *size = p - buffer; + *size = length; return 0; } -- cgit From 09592af5fdd722615ebe435fb34308de26c74bcf Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Tue, 26 Sep 2017 06:06:15 -0700 Subject: IB/hfi1: Return correct value in general interrupt handler The general interrupt handler returns IRQ_HANDLED whether an IRQ was handled or not. Determine if an IRQ was handled and return the correct value. Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 27b75a8f5097..0be42787759f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8240,6 +8240,7 @@ static irqreturn_t general_interrupt(int irq, void *data) u64 regs[CCE_NUM_INT_CSRS]; u32 bit; int i; + irqreturn_t handled = IRQ_NONE; this_cpu_inc(*dd->int_counter); @@ -8260,9 +8261,10 @@ static irqreturn_t general_interrupt(int irq, void *data) for_each_set_bit(bit, (unsigned long *)®s[0], CCE_NUM_INT_CSRS * 64) { is_interrupt(dd, bit); + handled = IRQ_HANDLED; } - return IRQ_HANDLED; + return handled; } static irqreturn_t sdma_interrupt(int irq, void *data) -- cgit From 612601d0013f03de9dc134809f242ba6da9ca252 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Tue, 26 Sep 2017 06:06:22 -0700 Subject: Revert "IB/ipoib: Update broadcast object if PKey value was changed in index 0" commit 9a9b8112699d will cause core to fail UD QP from being destroyed on ipoib unload, therefore cause resources leakage. On pkey change event above patch modifies mgid before calling underlying driver to detach it from QP. Drivers' detach_mcast() will fail to find modified mgid it was never given to attach in a first place. Core qp->usecnt will never go down, so ib_destroy_qp() will fail. IPoIB driver actually does take care of new broadcast mgid based on new pkey by destroying an old mcast object in ipoib_mcast_dev_flush()) .... if (priv->broadcast) { rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); list_add_tail(&priv->broadcast->list, &remove_list); priv->broadcast = NULL; } ... then in restarted ipoib_macst_join_task() creating a new broadcast mcast object, sending join request and on completion tells the driver to attach to reinitialized QP: ... if (!priv->broadcast) { ... broadcast = ipoib_mcast_alloc(dev, 0); ... memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; ... Fixes: 9a9b8112699d ("IB/ipoib: Update broadcast object if PKey value was changed in index 0") Cc: stable@vger.kernel.org Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Reviewed-by: Feras Daoud Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 2e075377242e..6cd61638b441 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1000,19 +1000,6 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv) */ priv->dev->broadcast[8] = priv->pkey >> 8; priv->dev->broadcast[9] = priv->pkey & 0xff; - - /* - * Update the broadcast address in the priv->broadcast object, - * in case it already exists, otherwise no one will do that. - */ - if (priv->broadcast) { - spin_lock_irq(&priv->lock); - memcpy(priv->broadcast->mcmember.mgid.raw, - priv->dev->broadcast + 4, - sizeof(union ib_gid)); - spin_unlock_irq(&priv->lock); - } - return 0; } -- cgit From b8f42738acaddf67731c34935c0994e09a588ca7 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Tue, 26 Sep 2017 06:06:28 -0700 Subject: IB/hfi1: On error, fix use after free during user context setup During base context setup, if setup_base_ctxt() fails, the context is deallocated. This is incorrect because the context is referenced on return, to notify any waiting subcontext. If there are no subcontexts the pointer will be invalid. Reorganize the error path so that deallocate_ctxt() is called after all the possible subcontexts have been notified. Reviewed-by: Ira Weiny Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 41 +++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2bc89260235a..d9a1e9893136 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -930,15 +930,8 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo) switch (ret) { case 0: ret = setup_base_ctxt(fd, uctxt); - if (uctxt->subctxt_cnt) { - /* - * Base context is done (successfully or not), notify - * anybody using a sub-context that is waiting for - * this completion. - */ - clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); - wake_up(&uctxt->wait); - } + if (ret) + deallocate_ctxt(uctxt); break; case 1: ret = complete_subctxt(fd); @@ -1305,25 +1298,25 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, /* Now allocate the RcvHdr queue and eager buffers. */ ret = hfi1_create_rcvhdrq(dd, uctxt); if (ret) - return ret; + goto done; ret = hfi1_setup_eagerbufs(uctxt); if (ret) - goto setup_failed; + goto done; /* If sub-contexts are enabled, do the appropriate setup */ if (uctxt->subctxt_cnt) ret = setup_subctxt(uctxt); if (ret) - goto setup_failed; + goto done; ret = hfi1_alloc_ctxt_rcv_groups(uctxt); if (ret) - goto setup_failed; + goto done; ret = init_user_ctxt(fd, uctxt); if (ret) - goto setup_failed; + goto done; user_init(uctxt); @@ -1331,12 +1324,22 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, fd->uctxt = uctxt; hfi1_rcd_get(uctxt); - return 0; +done: + if (uctxt->subctxt_cnt) { + /* + * On error, set the failed bit so sub-contexts will clean up + * correctly. + */ + if (ret) + set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); -setup_failed: - /* Set the failed bit so sub-context init can do the right thing */ - set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); - deallocate_ctxt(uctxt); + /* + * Base context is done (successfully or not), notify anybody + * using a sub-context that is waiting for this completion. + */ + clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } return ret; } -- cgit From 828bcbdc975fbcfb27946c33d4b1d1bfab70789b Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Tue, 26 Sep 2017 06:06:34 -0700 Subject: IB/hfi1: Unsuccessful PCIe caps tuning should not fail driver load Failure to tune PCIe capabilities should not fail driver load. This can cause the driver load to fail on systems with any of the following: 1. HFI's parent is not root. Example: HFI card is behind a PCIe bridge. 2. HFI's parent is not PCI Express capable. In these situations, failure to tune PCIe capabilities should be logged in the system message logs but not cause the driver load to fail. This patch also ensures pcie capability word DevCtl is written only after a successful read and the capability tuning process continues even if read/write of the pcie capability word DevCtl fails. Fixes: c53df62c7a9a ("IB/hfi1: Check return values from PCI config API calls") Fixes: bf70a7757736 ("staging/rdma/hfi1: Enable WFR PCIe extended tags from the driver") Reviewed-by: Michael J. Ruhl Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pcie.c | 50 ++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 82447b7cdda1..09e50fd2a08f 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -68,7 +68,7 @@ /* * Code to adjust PCIe capabilities. */ -static int tune_pcie_caps(struct hfi1_devdata *); +static void tune_pcie_caps(struct hfi1_devdata *); /* * Do all the common PCIe setup and initialization. @@ -351,7 +351,7 @@ int pcie_speeds(struct hfi1_devdata *dd) */ int request_msix(struct hfi1_devdata *dd, u32 msireq) { - int nvec, ret; + int nvec; nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq, PCI_IRQ_MSIX | PCI_IRQ_LEGACY); @@ -360,12 +360,7 @@ int request_msix(struct hfi1_devdata *dd, u32 msireq) return nvec; } - ret = tune_pcie_caps(dd); - if (ret) { - dd_dev_err(dd, "tune_pcie_caps() failed: %d\n", ret); - pci_free_irq_vectors(dd->pcidev); - return ret; - } + tune_pcie_caps(dd); /* check for legacy IRQ */ if (nvec == 1 && !dd->pcidev->msix_enabled) @@ -502,7 +497,7 @@ uint aspm_mode = ASPM_MODE_DISABLED; module_param_named(aspm, aspm_mode, uint, S_IRUGO); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static int tune_pcie_caps(struct hfi1_devdata *dd) +static void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; @@ -513,22 +508,14 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * Turn on extended tags in DevCtl in case the BIOS has turned it off * to improve WFR SDMA bandwidth */ - ret = pcie_capability_read_word(dd->pcidev, - PCI_EXP_DEVCTL, &ectl); - if (ret) { - dd_dev_err(dd, "Unable to read from PCI config\n"); - return ret; - } - - if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + if ((!ret) && !(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { dd_dev_info(dd, "Enabling PCIe extended tags\n"); ectl |= PCI_EXP_DEVCTL_EXT_TAG; ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); - if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); - return ret; - } + if (ret) + dd_dev_info(dd, "Unable to write to PCI config\n"); } /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; @@ -536,15 +523,22 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * The driver cannot perform the tuning if it does not have * access to the upstream component. */ - if (!parent) - return -EINVAL; + if (!parent) { + dd_dev_info(dd, "Parent not found\n"); + return; + } if (!pci_is_root_bus(parent->bus)) { dd_dev_info(dd, "Parent not root\n"); - return -EINVAL; + return; + } + if (!pci_is_pcie(parent)) { + dd_dev_info(dd, "Parent is not PCI Express capable\n"); + return; + } + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_info(dd, "PCI device is not PCI Express capable\n"); + return; } - - if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - return -EINVAL; rc_mpss = parent->pcie_mpss; rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ @@ -590,8 +584,6 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) ep_mrrs = max_mrrs; pcie_set_readrq(dd->pcidev, ep_mrrs); } - - return 0; } /* End of PCIe capability tuning */ -- cgit From 36de80740008e6a4a55115b4a92e2059e47c1cba Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Wed, 27 Sep 2017 14:49:17 +0200 Subject: mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user When calculating the size needed by struct atmel_pmecc_user *user, the dmu and delta buffer sizes were forgotten. This lead to a memory corruption (especially with a large ecc_strength). Link: http://lkml.kernel.org/r/1506503157.3016.5.camel@gmail.com Fixes: f88fc122cc34 ("mtd: nand: Cleanup/rework the atmel_nand driver") Cc: stable@vger.kernel.org Reported-by: Richard Genoud Pointed-at-by: Boris Brezillon Signed-off-by: Richard Genoud Reviewed-by: Nicolas Ferre Signed-off-by: Boris Brezillon --- drivers/mtd/nand/atmel/pmecc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/atmel/pmecc.c b/drivers/mtd/nand/atmel/pmecc.c index 146af8218314..8268636675ef 100644 --- a/drivers/mtd/nand/atmel/pmecc.c +++ b/drivers/mtd/nand/atmel/pmecc.c @@ -363,7 +363,7 @@ atmel_pmecc_create_user(struct atmel_pmecc *pmecc, size += (req->ecc.strength + 1) * sizeof(u16); /* Reserve space for mu, dmu and delta. */ size = ALIGN(size, sizeof(s32)); - size += (req->ecc.strength + 1) * sizeof(s32); + size += (req->ecc.strength + 1) * sizeof(s32) * 3; user = kzalloc(size, GFP_KERNEL); if (!user) -- cgit From a5f3d8a5eaaf917878f07998e6f1ea46024e6bab Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 26 Sep 2017 17:54:12 +0800 Subject: bcache: use llist_for_each_entry_safe() in __closure_wake_up() Commit 09b3efec ("bcache: Don't reinvent the wheel but use existing llist API") replaces the following while loop by llist_for_each_entry(), - - while (reverse) { - cl = container_of(reverse, struct closure, list); - reverse = llist_next(reverse); - + llist_for_each_entry(cl, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } This modification introduces a potential race by iterating a corrupted list. Here is how it happens. In the above modification, closure_sub() may wake up a process which is waiting on reverse list. If this process decides to wait again by calling closure_wait(), its cl->list will be added to another wait list. Then when llist_for_each_entry() continues to iterate next node, it will travel on another new wait list which is added in closure_wait(), not the original reverse list in __closure_wake_up(). It is more probably to happen on UP machine because the waked up process may preempt the process which wakes up it. Use llist_for_each_entry_safe() will fix the issue, the safe version fetch next node before waking up a process. Then the copy of next node will make sure list iteration stays on original reverse list. Fixes: 09b3efec81de ("bcache: Don't reinvent the wheel but use existing llist API") Signed-off-by: Coly Li Reported-by: Michael Lyle Reviewed-by: Byungchul Park Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 7d5286b05036..1841d0359bac 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -64,7 +64,7 @@ EXPORT_SYMBOL(closure_put); void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; - struct closure *cl; + struct closure *cl, *t; struct llist_node *reverse = NULL; list = llist_del_all(&wait_list->list); @@ -73,7 +73,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) reverse = llist_reverse_order(list); /* Then do the wakeups */ - llist_for_each_entry(cl, reverse, list) { + llist_for_each_entry_safe(cl, t, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } -- cgit From aaf2c2fb0f51f91c699039440862b6ae9c25c10e Mon Sep 17 00:00:00 2001 From: Tyler Baicar Date: Mon, 28 Aug 2017 10:53:41 -0600 Subject: ACPI / APEI: clear error status before acknowledging the error Currently we acknowledge errors before clearing the error status. This could cause a new error to be populated by firmware in-between the error acknowledgment and the error status clearing which would cause the second error's status to be cleared without being handled. So, clear the error status before acknowledging the errors. Also, make sure to acknowledge the error if the error status read fails. Signed-off-by: Tyler Baicar Reviewed-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 077f9bad6f44..3c3a37b8503b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -743,17 +743,19 @@ static int ghes_proc(struct ghes *ghes) } ghes_do_proc(ghes, ghes->estatus); +out: + ghes_clear_estatus(ghes); + + if (rc == -ENOENT) + return rc; + /* * GHESv2 type HEST entries introduce support for error acknowledgment, * so only acknowledge the error if this support is present. */ - if (is_hest_type_generic_v2(ghes)) { - rc = ghes_ack_error(ghes->generic_v2); - if (rc) - return rc; - } -out: - ghes_clear_estatus(ghes); + if (is_hest_type_generic_v2(ghes)) + return ghes_ack_error(ghes->generic_v2); + return rc; } -- cgit From 2e08d20d777e997bf37806b22b471f98fbe6b693 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 27 Sep 2017 16:34:59 -0500 Subject: percpu: fix starting offset for chunk statistics traversal This patch fixes the starting offset used when scanning chunks to compute the chunk statistics. The value start_offset (and end_offset) are managed in bytes while the traversal occurs over bits. Thus for the reserved and dynamic chunk, it may incorrectly skip over the initial allocations. Signed-off-by: Dennis Zhou Signed-off-by: Tejun Heo --- mm/percpu-stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 6142484e88f7..7a58460bfd27 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -73,7 +73,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, last_alloc + 1 : 0; as_len = 0; - start = chunk->start_offset; + start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; /* * If a bit is set in the allocation map, the bound_map identifies -- cgit From 8aba2333904f9b1c1ea038df261bf7ae8fefb98e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 02:08:43 +0200 Subject: cpufreq: docs: Drop intel-pstate.txt from index.txt Commit 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) dropped the intel-pstate.txt file from Documentation/cpu-freq/, but it did not update the index.txt file in there accordingly, so do that now. Fixes: 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) Signed-off-by: Rafael J. Wysocki --- Documentation/cpu-freq/index.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/cpu-freq/index.txt b/Documentation/cpu-freq/index.txt index 03a7cee6ac73..c15e75386a05 100644 --- a/Documentation/cpu-freq/index.txt +++ b/Documentation/cpu-freq/index.txt @@ -32,8 +32,6 @@ cpufreq-stats.txt - General description of sysfs cpufreq stats. index.txt - File index, Mailing list and Links (this document) -intel-pstate.txt - Intel pstate cpufreq driver specific file. - pcc-cpufreq.txt - PCC cpufreq driver specific file. -- cgit From d1b490939d8c117a06dfc562c41d933f71d30289 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Tue, 19 Sep 2017 12:11:55 -0300 Subject: scsi: aacraid: Add a small delay after IOP reset Commit 0e9973ed3382 ("scsi: aacraid: Add periodic checks to see IOP reset status") changed the way driver checks if a reset succeeded. Now, after an IOP reset, aacraid immediately start polling a register to verify the reset is complete. This behavior cause regressions on the reset path in PowerPC (at least). Since the delay after the IOP reset was removed by the aforementioned patch, the fact driver just starts to read a register instantly after the reset was issued (by writing in another register) "corrupts" the reset procedure, which ends up failing all the time. The issue highly impacted kdump on PowerPC, since on kdump path we proactively issue a reset in adapter (through the reset_devices kernel parameter). This patch (re-)adds a delay right after IOP reset is issued. Empirically we measured that 3 seconds is enough, but for safety reasons we delay for 5s (and since it was 30s before, 5s is still a small amount). For reference, without this patch we observe the following messages on kdump kernel boot process: [ 76.294] aacraid 0003:01:00.0: IOP reset failed [ 76.294] aacraid 0003:01:00.0: ARC Reset attempt failed [ 86.524] aacraid 0003:01:00.0: adapter kernel panic'd ff. [ 86.524] aacraid 0003:01:00.0: Controller reset type is 3 [ 86.524] aacraid 0003:01:00.0: Issuing IOP reset [146.534] aacraid 0003:01:00.0: IOP reset failed [146.534] aacraid 0003:01:00.0: ARC Reset attempt failed Fixes: 0e9973ed3382 ("scsi: aacraid: Add periodic checks to see IOP reset status") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Guilherme G. Piccoli Acked-by: Dave Carroll Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/src.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/aacraid/src.c b/drivers/scsi/aacraid/src.c index 48c2b2b34b72..0c9361c87ec8 100644 --- a/drivers/scsi/aacraid/src.c +++ b/drivers/scsi/aacraid/src.c @@ -740,6 +740,8 @@ static void aac_send_iop_reset(struct aac_dev *dev) aac_set_intx_mode(dev); src_writel(dev, MUnit.IDR, IOP_SRC_RESET_MASK); + + msleep(5000); } static void aac_send_hardware_soft_reset(struct aac_dev *dev) -- cgit From d0b7a9095c0730b92a0a2eecaba2e6b77ed87339 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 27 Sep 2017 14:44:19 +0200 Subject: scsi: ILLEGAL REQUEST + ASC==27 => target failure ASC 0x27 is "WRITE PROTECTED". This error code is returned e.g. by Fujitsu ETERNUS systems under certain conditions for WRITE SAME 16 commands with UNMAP bit set. It should not be treated as a path error. In general, it makes sense to assume that being write protected is a target rather than a path property. Signed-off-by: Martin Wilck Acked-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 38942050b265..dab876c65473 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -580,7 +580,8 @@ int scsi_check_sense(struct scsi_cmnd *scmd) if (sshdr.asc == 0x20 || /* Invalid command operation code */ sshdr.asc == 0x21 || /* Logical block address out of range */ sshdr.asc == 0x24 || /* Invalid field in cdb */ - sshdr.asc == 0x26) { /* Parameter value invalid */ + sshdr.asc == 0x26 || /* Parameter value invalid */ + sshdr.asc == 0x27) { /* Write protected */ set_host_byte(scmd, DID_TARGET_FAILURE); } return SUCCESS; -- cgit From 393debc23c7820211d1c8253dd6a8408a7628fe7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 10:23:35 -0700 Subject: md: separate request handling With commit cc27b0c78c79, pers->make_request could bail out without handling the bio. If that happens, we should retry. The commit fixes md_make_request but not other call sites. Separate the request handling part, so other call sites can use it. Reported-by: Nate Dailey Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 58 ++++++++++++++++++++++++++++++++------------------------- drivers/md/md.h | 1 + 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 08fcaebc61bd..1db1a22ed835 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,6 +266,37 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +void md_handle_request(struct mddev *mddev, struct bio *bio) +{ +check_suspended: + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} +EXPORT_SYMBOL(md_handle_request); + static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); @@ -285,23 +316,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } -check_suspended: - rcu_read_lock(); - if (mddev->suspended) { - DEFINE_WAIT(__wait); - for (;;) { - prepare_to_wait(&mddev->sb_wait, &__wait, - TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) - break; - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - } - finish_wait(&mddev->sb_wait, &__wait); - } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); /* * save the sectors now since our bio can @@ -310,20 +324,14 @@ check_suspended: sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - if (!mddev->pers->make_request(mddev, bio)) { - atomic_dec(&mddev->active_io); - wake_up(&mddev->sb_wait); - goto check_suspended; - } + + md_handle_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); part_stat_unlock(); - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); - return BLK_QC_T_NONE; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 561d22b9a9a8..d8287d3cd1bf 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -692,6 +692,7 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); +extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, -- cgit From 79bf31a3b2a7ca467cfec8ff97d359a77065d01f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 09:55:28 -0700 Subject: md: fix a race condition for flush request handling md_submit_flush_data calls pers->make_request, which missed the suspend check. Fix it with the new md_handle_request API. Reported-by: Nate Dailey Tested-by: Nate Dailey Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1db1a22ed835..0ff1bbf6c90e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -447,16 +447,22 @@ static void md_submit_flush_data(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); } - - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); } void md_flush_request(struct mddev *mddev, struct bio *bio) -- cgit From c4d6a1b8e8ea79c439a4871cba540443c9eb13b9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 10:29:22 -0700 Subject: dm-raid: fix a race condition in request handling raid_map calls pers->make_request, which missed the suspend check. Fix it with the new md_handle_request API. Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: Heinz Mauelshagen Cc: Mike Snitzer Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5bfe285ea9d1..1ac58c5651b7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3238,7 +3238,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); return DM_MAPIO_SUBMITTED; } -- cgit From 7d5d7b5058fbd638914e42504677141a69f43011 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 11:03:52 -0700 Subject: md/raid5: cap worker count static checker reports a potential integer overflow. Cap the worker count to avoid the overflow. Reported:-by: Dan Carpenter Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 076409455b60..928e24a07133 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6575,14 +6575,17 @@ static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf; - unsigned long new; + unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; - if (kstrtoul(page, 10, &new)) + if (kstrtouint(page, 10, &new)) + return -EINVAL; + /* 8192 should be big enough */ + if (new > 8192) return -EINVAL; err = mddev_lock(mddev); -- cgit From 38e8a5c040d3ec99a8351c688dcdf0f549611565 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 21 Aug 2017 12:04:50 +0300 Subject: net/mlx5e: IPoIB, Fix access to invalid memory address When cleaning rdma netdevice we need to save the mdev pointer because priv is released when we release netdev. This bug was found using the kernel address sanitizer (KASAN). use-after-free in mlx5_rdma_netdev_free+0xe3/0x100 [mlx5_core] Fixes: 48935bbb7ae8 ("net/mlx5e: IPoIB, Add netdevice profile skeleton") Signed-off-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 85298051a3e4..145e392ab849 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -572,12 +572,13 @@ void mlx5_rdma_netdev_free(struct net_device *netdev) { struct mlx5e_priv *priv = mlx5i_epriv(netdev); const struct mlx5e_profile *profile = priv->profile; + struct mlx5_core_dev *mdev = priv->mdev; mlx5e_detach_netdev(priv); profile->cleanup(priv); destroy_workqueue(priv->wq); free_netdev(netdev); - mlx5e_destroy_mdev_resources(priv->mdev); + mlx5e_destroy_mdev_resources(mdev); } EXPORT_SYMBOL(mlx5_rdma_netdev_free); -- cgit From 99d3cd27f755d63fd6cf85169eaa873d90769aa5 Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Thu, 24 Aug 2017 17:21:44 +0300 Subject: net/mlx5: Fix FPGA capability location Currently, FPGA capability is located in (mdev)->caps.hca_cur, change the location to be (mdev)->caps.fpga, since hca_cur is reserved for HCA device capabilities. Fixes: e29341fb3a5b ("net/mlx5: FPGA, Add basic support for Innova") Signed-off-by: Inbar Karmy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 3 +-- include/linux/mlx5/device.h | 5 ++--- include/linux/mlx5/driver.h | 1 + 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c index e37453d838db..c0fd2212e890 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c @@ -71,11 +71,11 @@ int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, return 0; } -int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps) +int mlx5_fpga_caps(struct mlx5_core_dev *dev) { u32 in[MLX5_ST_SZ_DW(fpga_cap)] = {0}; - return mlx5_core_access_reg(dev, in, sizeof(in), caps, + return mlx5_core_access_reg(dev, in, sizeof(in), dev->caps.fpga, MLX5_ST_SZ_BYTES(fpga_cap), MLX5_REG_FPGA_CAP, 0, 0); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h index 94bdfd47c3f0..d05233c9b4f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h @@ -65,7 +65,7 @@ struct mlx5_fpga_qp_counters { u64 rx_total_drop; }; -int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps); +int mlx5_fpga_caps(struct mlx5_core_dev *dev); int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query); int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op); int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c index 9034e9960a76..dc8970346521 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c @@ -139,8 +139,7 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev) if (err) goto out; - err = mlx5_fpga_caps(fdev->mdev, - fdev->mdev->caps.hca_cur[MLX5_CAP_FPGA]); + err = mlx5_fpga_caps(fdev->mdev); if (err) goto out; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index eaf4ad209c8f..e32dbc4934db 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -980,7 +980,6 @@ enum mlx5_cap_type { MLX5_CAP_RESERVED, MLX5_CAP_VECTOR_CALC, MLX5_CAP_QOS, - MLX5_CAP_FPGA, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1110,10 +1109,10 @@ enum mlx5_mcam_feature_groups { MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld) #define MLX5_CAP_FPGA(mdev, cap) \ - MLX5_GET(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap) + MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap) #define MLX5_CAP64_FPGA(mdev, cap) \ - MLX5_GET64(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap) + MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap) enum { MLX5_CMD_STAT_OK = 0x0, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 02ff700e4f30..401c8972cc3a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -774,6 +774,7 @@ struct mlx5_core_dev { u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; u32 mcam[MLX5_ST_SZ_DW(mcam_reg)]; + u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; } caps; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; -- cgit From 16f1c5bb3ed75b3cf3ced537db40f7e1a244debe Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Sun, 30 Jul 2017 11:02:51 +0300 Subject: net/mlx5: Check device capability for maximum flow counters Added check for the maximal number of flow counters attached to rule (FTE). Fixes: bd5251dbf156b ('net/mlx5_core: Introduce flow steering destination of type counter') Signed-off-by: Raed Salem Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 8 ++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 11 +++++++++++ include/linux/mlx5/mlx5_ifc.h | 3 ++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index e0d0efd903bc..36ecc2b2e187 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -293,6 +293,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, } if (fte->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev, + log_max_flow_counter, + ft->type)); int list_size = 0; list_for_each_entry(dst, &fte->node.children, node.list) { @@ -305,12 +308,17 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, in_dests += MLX5_ST_SZ_BYTES(dest_format_struct); list_size++; } + if (list_size > max_list_size) { + err = -EINVAL; + goto err_out; + } MLX5_SET(flow_context, in_flow_context, flow_counter_list_size, list_size); } err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +err_out: kvfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 5509a752f98e..48dd78975062 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -52,6 +52,7 @@ enum fs_flow_table_type { FS_FT_FDB = 0X4, FS_FT_SNIFFER_RX = 0X5, FS_FT_SNIFFER_TX = 0X6, + FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX, }; enum fs_flow_table_op_mod { @@ -260,4 +261,14 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_for_each_dst(pos, fte) \ fs_list_for_each_entry(pos, &(fte)->node.children) +#define MLX5_CAP_FLOWTABLE_TYPE(mdev, cap, type) ( \ + (type == FS_FT_NIC_RX) ? MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) : \ + (type == FS_FT_ESW_EGRESS_ACL) ? MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) : \ + (type == FS_FT_ESW_INGRESS_ACL) ? MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) : \ + (type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) : \ + (type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) : \ + (type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) : \ + (BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\ + ) + #endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a528b35a022e..69772347f866 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -327,7 +327,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_80[0x18]; u8 log_max_destination[0x8]; - u8 reserved_at_a0[0x18]; + u8 log_max_flow_counter[0x8]; + u8 reserved_at_a8[0x10]; u8 log_max_flow[0x8]; u8 reserved_at_c0[0x40]; -- cgit From ace743214ea205c7d433562c5fa24e33bdfda7ab Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 5 Sep 2017 15:05:51 +0300 Subject: net/mlx5e: Fix erroneous freeing of encap header buffer In case the neighbour for the tunnel destination isn't valid, we send a neighbour update request but we free the encap header buffer. This is wrong, because we still need it for allocating a HW encap entry once the neighbour is available. Fix that by skipping freeing it if we wait for neighbour. Fixes: 232c001398ae ('net/mlx5e: Add support to neighbour update flow') Signed-off-by: Paul Blakey Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index da503e6411da..4e2fc016bdd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1564,7 +1564,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, break; default: err = -EOPNOTSUPP; - goto out; + goto free_encap; } fl4.flowi4_tos = tun_key->tos; fl4.daddr = tun_key->u.ipv4.dst; @@ -1573,7 +1573,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev, &fl4, &n, &ttl); if (err) - goto out; + goto free_encap; /* used by mlx5e_detach_encap to lookup a neigh hash table * entry in the neigh hash table when a user deletes a rule @@ -1590,7 +1590,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, */ err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e); if (err) - goto out; + goto free_encap; read_lock_bh(&n->lock); nud_state = n->nud_state; @@ -1630,8 +1630,9 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, destroy_neigh_entry: mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); -out: +free_encap: kfree(encap_header); +out: if (n) neigh_release(n); return err; @@ -1668,7 +1669,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, break; default: err = -EOPNOTSUPP; - goto out; + goto free_encap; } fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label); @@ -1678,7 +1679,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev, &fl6, &n, &ttl); if (err) - goto out; + goto free_encap; /* used by mlx5e_detach_encap to lookup a neigh hash table * entry in the neigh hash table when a user deletes a rule @@ -1695,7 +1696,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, */ err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e); if (err) - goto out; + goto free_encap; read_lock_bh(&n->lock); nud_state = n->nud_state; @@ -1736,8 +1737,9 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, destroy_neigh_entry: mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); -out: +free_encap: kfree(encap_header); +out: if (n) neigh_release(n); return err; -- cgit From bdd66ac0aeed971d1cb42b3aa0d11b0ea3842e09 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 11 Jun 2017 21:13:25 +0300 Subject: net/mlx5e: Disallow TC offloading of unsupported match/action combinations When offloading header re-write, the HW may need to adjust checksums along the packet. For IP traffic, and a case where we are asked to modify fields in the IP header, current HW supports that only for TCP and UDP. Enforce it, in this case fail the offloading attempt for non TCP/UDP packets. Fixes: d7e75a325cb2 ('net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions') Fixes: 2f4fe4cab073 ('net/mlx5e: Add offloading of NIC TC pedit (header re-write) actions') Signed-off-by: Or Gerlitz Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 70 +++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 4e2fc016bdd6..d3786005fba7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1317,6 +1317,69 @@ static bool csum_offload_supported(struct mlx5e_priv *priv, u32 action, u32 upda return true; } +static bool modify_header_match_supported(struct mlx5_flow_spec *spec, + struct tcf_exts *exts) +{ + const struct tc_action *a; + bool modify_ip_header; + LIST_HEAD(actions); + u8 htype, ip_proto; + void *headers_v; + u16 ethertype; + int nkeys, i; + + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); + + /* for non-IP we only re-write MACs, so we're okay */ + if (ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) + goto out_ok; + + modify_ip_header = false; + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (!is_tcf_pedit(a)) + continue; + + nkeys = tcf_pedit_nkeys(a); + for (i = 0; i < nkeys; i++) { + htype = tcf_pedit_htype(a, i); + if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 || + htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6) { + modify_ip_header = true; + break; + } + } + } + + ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol); + if (modify_ip_header && ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) { + pr_info("can't offload re-write of ip proto %d\n", ip_proto); + return false; + } + +out_ok: + return true; +} + +static bool actions_match_supported(struct mlx5e_priv *priv, + struct tcf_exts *exts, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + u32 actions; + + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + actions = flow->esw_attr->action; + else + actions = flow->nic_attr->action; + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + return modify_header_match_supported(&parse_attr->spec, exts); + + return true; +} + static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, struct mlx5e_tc_flow_parse_attr *parse_attr, struct mlx5e_tc_flow *flow) @@ -1378,6 +1441,9 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, return -EINVAL; } + if (!actions_match_supported(priv, exts, parse_attr, flow)) + return -EOPNOTSUPP; + return 0; } @@ -1936,6 +2002,10 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, return -EINVAL; } + + if (!actions_match_supported(priv, exts, parse_attr, flow)) + return -EOPNOTSUPP; + return err; } -- cgit From b281208911a549e391d92ee6cb680dcd3d71783b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 8 Aug 2017 11:45:28 +0300 Subject: net/mlx5e: Check encap entry state when offloading tunneled flows Encap entries cached by the driver could be invalidated due to tunnel destination neighbour state changes. When attempting to offload a flow that uses a cached encap entry, we must check the entry validity and defer the offloading if the entry exists but not valid. When EAGAIN is returned, the flow offloading to hardware takes place by the neigh update code when the tunnel destination neighbour becomes connected. Fixes: 232c001398ae ("net/mlx5e: Add support to neighbour update flow") Signed-off-by: Vlad Buslov Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index d3786005fba7..1aa2028ed995 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1859,6 +1859,7 @@ vxlan_encap_offload_err: } } + /* must verify if encap is valid or not */ if (found) goto attach_flow; @@ -1885,6 +1886,8 @@ attach_flow: *encap_dev = e->out_dev; if (e->flags & MLX5_ENCAP_ENTRY_VALID) attr->encap_id = e->encap_id; + else + err = -EAGAIN; return err; -- cgit From b20eab15a1d5091e45022401e75b49948e8be33f Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 12 Sep 2017 17:51:12 +0300 Subject: net/mlx5e: Print netdev features correctly in error message Use the correct formatting for netdev features. Fixes: 0e405443e803 ("net/mlx5e: Improve set features ndo resiliency") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dfc29720ab77..84b013dc62e9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3333,8 +3333,8 @@ static int mlx5e_handle_feature(struct net_device *netdev, err = feature_handler(netdev, enable); if (err) { - netdev_err(netdev, "%s feature 0x%llx failed err %d\n", - enable ? "Enable" : "Disable", feature, err); + netdev_err(netdev, "%s feature %pNF failed, err %d\n", + enable ? "Enable" : "Disable", &feature, err); return err; } -- cgit From 1456f69ff5fbba48ed5bc86e858e945e693ba0b7 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 10 Sep 2017 10:36:06 +0300 Subject: net/mlx5e: Don't add/remove 802.1ad rules when changing 802.1Q VLAN filter Toggling of C-tag VLAN filter should not affect the "any S-tag" steering rule. Fixes: 8a271746a264 ("net/mlx5e: Receive s-tagged packets in promiscuous mode") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index f11fd07ac4dd..850cdc980ab5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -291,7 +291,7 @@ void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv) priv->fs.vlan.filter_disabled = false; if (priv->netdev->flags & IFF_PROMISC) return; - mlx5e_del_any_vid_rules(priv); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); } void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv) @@ -302,7 +302,7 @@ void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv) priv->fs.vlan.filter_disabled = true; if (priv->netdev->flags & IFF_PROMISC) return; - mlx5e_add_any_vid_rules(priv); + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); } int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, -- cgit From 603e1f5bd3ca76f16688e10040545594d2e91ba4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 13 Sep 2017 15:37:50 +0300 Subject: net/mlx5e: Fix calculated checksum offloads counters Instead of calculating the offloads counters, count them explicitly. The calculations done for these counters would result in bugs in some cases, for example: When running TCP traffic over a VXLAN tunnel with TSO enabled the following counters would increase: tx_csum_partial: 1,333,284 tx_csum_partial_inner: 29,286 tx4_csum_partial_inner: 384 tx7_csum_partial_inner: 8 tx9_csum_partial_inner: 34 tx10_csum_partial_inner: 26,807 tx11_csum_partial_inner: 287 tx12_csum_partial_inner: 27 tx16_csum_partial_inner: 6 tx25_csum_partial_inner: 1,733 Seems like tx_csum_partial increased out of nowhere. The issue is in the following calculation in mlx5e_update_sw_counters: s->tx_csum_partial = s->tx_packets - tx_offload_none - s->tx_csum_partial_inner; While tx_packets increases by the number of GSO segments for each SKB, tx_csum_partial_inner will only increase by one, resulting in wrong tx_csum_partial counter. Fixes: bfe6d8d1d433 ("net/mlx5e: Reorganize ethtool statistics") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +++------ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 1 + 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 84b013dc62e9..cc11bbbd0309 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -184,7 +184,6 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv) struct mlx5e_sw_stats temp, *s = &temp; struct mlx5e_rq_stats *rq_stats; struct mlx5e_sq_stats *sq_stats; - u64 tx_offload_none = 0; int i, j; memset(s, 0, sizeof(*s)); @@ -199,6 +198,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv) s->rx_lro_bytes += rq_stats->lro_bytes; s->rx_csum_none += rq_stats->csum_none; s->rx_csum_complete += rq_stats->csum_complete; + s->rx_csum_unnecessary += rq_stats->csum_unnecessary; s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner; s->rx_xdp_drop += rq_stats->xdp_drop; s->rx_xdp_tx += rq_stats->xdp_tx; @@ -229,14 +229,11 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv) s->tx_queue_dropped += sq_stats->dropped; s->tx_xmit_more += sq_stats->xmit_more; s->tx_csum_partial_inner += sq_stats->csum_partial_inner; - tx_offload_none += sq_stats->csum_none; + s->tx_csum_none += sq_stats->csum_none; + s->tx_csum_partial += sq_stats->csum_partial; } } - /* Update calculated offload counters */ - s->tx_csum_partial = s->tx_packets - tx_offload_none - s->tx_csum_partial_inner; - s->rx_csum_unnecessary = s->rx_packets - s->rx_csum_none - s->rx_csum_complete; - s->link_down_events_phy = MLX5_GET(ppcnt_reg, priv->stats.pport.phy_counters, counter_set.phys_layer_cntrs.link_down_events); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f1dd638384d3..15a1687483cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -627,6 +627,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, if (lro) { skb->ip_summed = CHECKSUM_UNNECESSARY; + rq->stats.csum_unnecessary++; return; } @@ -644,7 +645,9 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, skb->csum_level = 1; skb->encapsulation = 1; rq->stats.csum_unnecessary_inner++; + return; } + rq->stats.csum_unnecessary++; return; } csum_none: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 6d199ffb1c0b..f8637213afc0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -68,6 +68,7 @@ struct mlx5e_sw_stats { u64 rx_xdp_drop; u64 rx_xdp_tx; u64 rx_xdp_tx_full; + u64 tx_csum_none; u64 tx_csum_partial; u64 tx_csum_partial_inner; u64 tx_queue_stopped; @@ -108,6 +109,7 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_none) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) }, @@ -339,6 +341,7 @@ struct mlx5e_rq_stats { u64 packets; u64 bytes; u64 csum_complete; + u64 csum_unnecessary; u64 csum_unnecessary_inner; u64 csum_none; u64 lro_packets; @@ -363,6 +366,7 @@ static const struct counter_desc rq_stats_desc[] = { { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, packets) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, bytes) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) }, @@ -392,6 +396,7 @@ struct mlx5e_sq_stats { u64 tso_bytes; u64 tso_inner_packets; u64 tso_inner_bytes; + u64 csum_partial; u64 csum_partial_inner; u64 nop; /* less likely accessed in data path */ @@ -408,6 +413,7 @@ static const struct counter_desc sq_stats_desc[] = { { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_bytes) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) }, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index fee43e40fa16..1d6925d4369a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -193,6 +193,7 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct sq->stats.csum_partial_inner++; } else { eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; + sq->stats.csum_partial++; } } else sq->stats.csum_none++; -- cgit From 480df991b869eff02a004e8fe7707900437cfcd4 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 31 Aug 2017 18:52:14 +0300 Subject: net/mlx5: Fix static checker warning on steering tracepoints code Fix this sparse complaint: drivers/net/ethernet/mellanox/mlx5/core/./diag/fs_tracepoint.h:172:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1) Fixes: d9fea79171ee ('net/mlx5: Add tracepoints') Signed-off-by: Matan Barak Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h index 1e3a6c3e4132..80eef4163f52 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h @@ -139,7 +139,7 @@ TRACE_EVENT(mlx5_fs_del_fg, {MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO, "NEXT_PRIO"} TRACE_EVENT(mlx5_fs_set_fte, - TP_PROTO(const struct fs_fte *fte, bool new_fte), + TP_PROTO(const struct fs_fte *fte, int new_fte), TP_ARGS(fte, new_fte), TP_STRUCT__entry( __field(const struct fs_fte *, fte) @@ -149,7 +149,7 @@ TRACE_EVENT(mlx5_fs_set_fte, __field(u32, action) __field(u32, flow_tag) __field(u8, mask_enable) - __field(bool, new_fte) + __field(int, new_fte) __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) __array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) -- cgit From 353f59f4d41e9c5798a15c5c52958f25b579a3d5 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 24 Sep 2017 09:54:00 +0200 Subject: net/mlx5: Fix wrong indentation in enable SRIOV code Smatch is screaming: drivers/net/ethernet/mellanox/mlx5/core/sriov.c:112 mlx5_device_enable_sriov() warn: inconsistent indenting fix that. Fixes: 7ecf6d8ff154 ('IB/mlx5: Restore IB guid/policy for virtual functions') Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 6c48e9959b65..2a8b529ce6dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -109,7 +109,7 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) mlx5_core_warn(dev, "failed to restore VF %d settings, err %d\n", vf, err); - continue; + continue; } } mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); -- cgit From da541b20021c781f8b65492eeaee824e729599eb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Sep 2017 17:34:23 -0500 Subject: objtool: Skip unreachable warnings for GCC 4.4 and older The kbuild bot occasionally reports warnings like: drivers/scsi/pcmcia/aha152x_core.o: warning: objtool: seldo_run()+0x130: unreachable instruction These warnings are always with GCC 4.4. That version of GCC sometimes places unreachable instructions after calls to noreturn functions. The unreachable warnings aren't very important anyway. Just ignore them for old versions of GCC. Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bc89b807d965b98ec18a0bb94f96a594bd58f2f2.1506551639.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- scripts/Makefile.build | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 2e3a10e79ca9..061d0c3a420a 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -265,6 +265,8 @@ objtool_args += --no-fp endif ifdef CONFIG_GCOV_KERNEL objtool_args += --no-unreachable +else +objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable) endif # 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory -- cgit From 607a4029d439cdfa258aff5da32bb9cd6ed1a66d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Sep 2017 10:36:38 -0500 Subject: objtool: Support unoptimized frame pointer setup Arnd Bergmann reported a bunch of warnings like: crypto/jitterentropy.o: warning: objtool: jent_fold_time()+0x3b: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_stuck()+0x1d: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_unbiased_bit()+0x15: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_read_entropy()+0x32: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_entropy_collector_free()+0x19: call without frame pointer save/setup and arch/x86/events/core.o: warning: objtool: collect_events uses BP as a scratch register arch/x86/events/core.o: warning: objtool: events_ht_sysfs_show()+0x22: call without frame pointer save/setup With certain rare configurations, GCC sometimes sets up the frame pointer with: lea (%rsp),%rbp instead of: mov %rsp,%rbp The instructions are equivalent, so treat the former like the latter. Reported-by: Arnd Bergmann Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a468af8b28a69b83fffc6d7668be9b6fcc873699.1506526584.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/arch/x86/decode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 0f22768c0d4d..34a579f806e3 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -284,11 +284,16 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, case 0x8d: if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - /* lea disp(%rsp), reg */ *type = INSN_STACK; - op->src.type = OP_SRC_ADD; + if (!insn.displacement.value) { + /* lea (%rsp), reg */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%rsp), reg */ + op->src.type = OP_SRC_ADD; + op->src.offset = insn.displacement.value; + } op->src.reg = CFI_SP; - op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; -- cgit From 66a733ea6b611aecf0119514d2dddab5f9d6c01e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Sep 2017 09:25:30 -0600 Subject: seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter() As Chris explains, get_seccomp_filter() and put_seccomp_filter() can end up using different filters. Once we drop ->siglock it is possible for task->seccomp.filter to have been replaced by SECCOMP_FILTER_FLAG_TSYNC. Fixes: f8e529ed941b ("seccomp, ptrace: add support for dumping seccomp filters") Reported-by: Chris Salls Cc: stable@vger.kernel.org # needs s/refcount_/atomic_/ for v4.12 and earlier Signed-off-by: Oleg Nesterov [tycho: add __get_seccomp_filter vs. open coding refcount_inc()] Signed-off-by: Tycho Andersen [kees: tweak commit log] Signed-off-by: Kees Cook --- kernel/seccomp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c24579dfa7a1..bb3a38005b9c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + refcount_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - refcount_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -491,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -503,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { memset(info, 0, sizeof(*info)); @@ -1025,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: -- cgit From fe659bcc9b173bcfdd958ce2aec75e47651e74e1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:22 -0400 Subject: USB: dummy-hcd: fix connection failures (wrong speed) The dummy-hcd UDC driver is not careful about the way it handles connection speeds. It ignores the module parameter that is supposed to govern the maximum connection speed and it doesn't set the HCD flags properly for the case where it ends up running at full speed. The result is that in many cases, gadget enumeration over dummy-hcd fails because the bMaxPacketSize byte in the device descriptor is set incorrectly. For example, the default settings call for a high-speed connection, but the maxpacket value for ep0 ends up being set for a Super-Speed connection. This patch fixes the problem by initializing the gadget's max_speed and the HCD flags correctly. Signed-off-by: Alan Stern CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index b1e21b3be6e1..d515ec31afe4 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -1036,7 +1036,12 @@ static int dummy_udc_probe(struct platform_device *pdev) memzero_explicit(&dum->gadget, sizeof(struct usb_gadget)); dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; - dum->gadget.max_speed = USB_SPEED_SUPER; + if (mod_data.is_super_speed) + dum->gadget.max_speed = USB_SPEED_SUPER; + else if (mod_data.is_high_speed) + dum->gadget.max_speed = USB_SPEED_HIGH; + else + dum->gadget.max_speed = USB_SPEED_FULL; dum->gadget.dev.parent = &pdev->dev; init_dummy_udc_hw(dum); @@ -2560,8 +2565,6 @@ static struct hc_driver dummy_hcd = { .product_desc = "Dummy host controller", .hcd_priv_size = sizeof(struct dummy_hcd), - .flags = HCD_USB3 | HCD_SHARED, - .reset = dummy_setup, .start = dummy_start, .stop = dummy_stop, @@ -2590,8 +2593,12 @@ static int dummy_hcd_probe(struct platform_device *pdev) dev_info(&pdev->dev, "%s, driver " DRIVER_VERSION "\n", driver_desc); dum = *((void **)dev_get_platdata(&pdev->dev)); - if (!mod_data.is_super_speed) + if (mod_data.is_super_speed) + dummy_hcd.flags = HCD_USB3 | HCD_SHARED; + else if (mod_data.is_high_speed) dummy_hcd.flags = HCD_USB2; + else + dummy_hcd.flags = HCD_USB11; hs_hcd = usb_create_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev)); if (!hs_hcd) return -ENOMEM; -- cgit From 0173a68bfb0ad1c72a6ee39cc485aa2c97540b98 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:40 -0400 Subject: USB: dummy-hcd: fix infinite-loop resubmission bug The dummy-hcd HCD/UDC emulator tries not to do too much work during each timer interrupt. But it doesn't try very hard; currently all it does is limit the total amount of bulk data transferred. Other transfer types aren't limited, and URBs that transfer no data (because of an error, perhaps) don't count toward the limit, even though on a real USB bus they would consume at least a minimum overhead. This means it's possible to get the driver stuck in an infinite loop, for example, if the host class driver resubmits an URB every time it completes (which is common for interrupt URBs). Each time the URB is resubmitted it gets added to the end of the pending-URBs list, and dummy-hcd doesn't stop until that list is empty. Andrey Konovalov was able to trigger this failure mode using the syzkaller fuzzer. This patch fixes the infinite-loop problem by restricting the URBs handled during each timer interrupt to those that were already on the pending list when the interrupt routine started. Newly added URBs won't be processed until the next timer interrupt. The problem of properly accounting for non-bulk bandwidth (as well as packet and transaction overhead) is not addressed here. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index d515ec31afe4..b2ab9cc33fec 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -237,6 +237,8 @@ struct dummy_hcd { struct usb_device *udev; struct list_head urbp_list; + struct urbp *next_frame_urbp; + u32 stream_en_ep; u8 num_stream[30 / 2]; @@ -1250,6 +1252,8 @@ static int dummy_urb_enqueue( list_add_tail(&urbp->urbp_list, &dum_hcd->urbp_list); urb->hcpriv = urbp; + if (!dum_hcd->next_frame_urbp) + dum_hcd->next_frame_urbp = urbp; if (usb_pipetype(urb->pipe) == PIPE_CONTROL) urb->error_count = 1; /* mark as a new urb */ @@ -1766,6 +1770,7 @@ static void dummy_timer(unsigned long _dum_hcd) spin_unlock_irqrestore(&dum->lock, flags); return; } + dum_hcd->next_frame_urbp = NULL; for (i = 0; i < DUMMY_ENDPOINTS; i++) { if (!ep_info[i].name) @@ -1782,6 +1787,10 @@ restart: int type; int status = -EINPROGRESS; + /* stop when we reach URBs queued after the timer interrupt */ + if (urbp == dum_hcd->next_frame_urbp) + break; + urb = urbp->urb; if (urb->unlinked) goto return_urb; -- cgit From 7dbd8f4cabd96db5a50513de9d83a8105a5ffc81 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:49 -0400 Subject: USB: dummy-hcd: Fix erroneous synchronization change A recent change to the synchronization in dummy-hcd was incorrect. The issue was that dummy_udc_stop() contained no locking and therefore could race with various gadget driver callbacks, and the fix was to add locking and issue the callbacks with the private spinlock held. UDC drivers aren't supposed to do this. Gadget driver callback routines are allowed to invoke functions in the UDC driver, and these functions will generally try to acquire the private spinlock. This would deadlock the driver. The correct solution is to drop the spinlock before issuing callbacks, and avoid races by emulating the synchronize_irq() call that all real UDC drivers must perform in their ->udc_stop() routines after disabling interrupts. This involves adding a flag to dummy-hcd's private structure to keep track of whether interrupts are supposed to be enabled, and adding a counter to keep track of ongoing callbacks so that dummy_udc_stop() can wait for them all to finish. A real UDC driver won't receive disconnect, reset, suspend, resume, or setup events once it has disabled interrupts. dummy-hcd will receive them but won't try to issue any gadget driver callbacks, which should be just as good. Signed-off-by: Alan Stern Fixes: f16443a034c7 ("USB: gadgetfs, dummy-hcd, net2280: fix locking for callbacks") CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index b2ab9cc33fec..b17618a55f1b 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -255,11 +255,13 @@ struct dummy { */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; + int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; + unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; @@ -441,18 +443,27 @@ static void set_link_state(struct dummy_hcd *dum_hcd) (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ - if (dum->driver && (disconnect || reset)) { + if (dum->ints_enabled && (disconnect || reset)) { stop_activity(dum); + ++dum->callback_usage; + spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } - } else if (dum_hcd->active != dum_hcd->old_active) { + } else if (dum_hcd->active != dum_hcd->old_active && + dum->ints_enabled) { + ++dum->callback_usage; + spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; @@ -973,8 +984,11 @@ static int dummy_udc_start(struct usb_gadget *g, * can't enumerate without help from the driver we're binding. */ + spin_lock_irq(&dum->lock); dum->devstatus = 0; dum->driver = driver; + dum->ints_enabled = 1; + spin_unlock_irq(&dum->lock); return 0; } @@ -985,6 +999,16 @@ static int dummy_udc_stop(struct usb_gadget *g) struct dummy *dum = dum_hcd->dum; spin_lock_irq(&dum->lock); + dum->ints_enabled = 0; + stop_activity(dum); + + /* emulate synchronize_irq(): wait for callbacks to finish */ + while (dum->callback_usage > 0) { + spin_unlock_irq(&dum->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dum->lock); + } + dum->driver = NULL; spin_unlock_irq(&dum->lock); @@ -1529,6 +1553,8 @@ static struct dummy_ep *find_endpoint(struct dummy *dum, u8 address) if (!is_active((dum->gadget.speed == USB_SPEED_SUPER ? dum->ss_hcd : dum->hs_hcd))) return NULL; + if (!dum->ints_enabled) + return NULL; if ((address & ~USB_DIR_IN) == 0) return &dum->ep[0]; for (i = 1; i < DUMMY_ENDPOINTS; i++) { @@ -1870,10 +1896,12 @@ restart: * until setup() returns; no reentrancy issues etc. */ if (value > 0) { + ++dum->callback_usage; spin_unlock(&dum->lock); value = dum->driver->setup(&dum->gadget, &setup); spin_lock(&dum->lock); + --dum->callback_usage; if (value >= 0) { /* no delays (max 64KB data stage) */ -- cgit From 4dcf4bab4a409e81284b8202137e4a85b96b34de Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:23 +0900 Subject: usb: gadget: udc: renesas_usb3: fix for no-data control transfer When bRequestType & USB_DIR_IN is false and req.length is 0 in control transfer, since it means non-data, this driver should not set the mode as control write. So, this patch fixes it. Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: # v4.5+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index df37c1e6e9d5..555c105e82df 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1150,7 +1150,8 @@ static void usb3_start_pipe0(struct renesas_usb3_ep *usb3_ep, usb3_set_p0_con_for_ctrl_read_data(usb3); } else { usb3_clear_bit(usb3, P0_MOD_DIR, USB3_P0_MOD); - usb3_set_p0_con_for_ctrl_write_data(usb3); + if (usb3_req->req.length) + usb3_set_p0_con_for_ctrl_write_data(usb3); } usb3_p0_xfer(usb3_ep, usb3_req); -- cgit From 73f2f5745f18b4ccfe9484deac4e84a1378d19fd Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:24 +0900 Subject: usb: gadget: udc: renesas_usb3: fix Pn_RAMMAP.Pn_MPKT value According to the datasheet of R-Car Gen3, the Pn_RAMMAP.Pn_MPKT should be set to one of 8, 16, 32, 64, 512 and 1024. Otherwise, when a gadget driver uses an interrupt endpoint, unexpected behavior happens. So, this patch fixes it. Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: # v4.5+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 555c105e82df..7e0c53492356 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -2054,7 +2054,16 @@ static u32 usb3_calc_ramarea(int ram_size) static u32 usb3_calc_rammap_val(struct renesas_usb3_ep *usb3_ep, const struct usb_endpoint_descriptor *desc) { - return usb3_ep->rammap_val | PN_RAMMAP_MPKT(usb_endpoint_maxp(desc)); + int i; + const u32 max_packet_array[] = {8, 16, 32, 64, 512}; + u32 mpkt = PN_RAMMAP_MPKT(1024); + + for (i = 0; i < ARRAY_SIZE(max_packet_array); i++) { + if (usb_endpoint_maxp(desc) <= max_packet_array[i]) + mpkt = PN_RAMMAP_MPKT(max_packet_array[i]); + } + + return usb3_ep->rammap_val | mpkt; } static int usb3_enable_pipe_n(struct renesas_usb3_ep *usb3_ep, -- cgit From 447b8a01b84f048d93d43bfe1fcaa4fcc56595cc Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:25 +0900 Subject: usb: gadget: udc: renesas_usb3: Fix return value of usb3_write_pipe() This patch fixes an issue that this driver cannot go status stage in control read when the req.zero is set to 1 and the len in usb3_write_pipe() is set to 0. Otherwise, if we use g_ncm driver, usb enumeration takes long time (5 seconds or more). Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: # v4.5+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 7e0c53492356..63a206122058 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1038,7 +1038,7 @@ static int usb3_write_pipe(struct renesas_usb3_ep *usb3_ep, usb3_ep->ep.maxpacket); u8 *buf = usb3_req->req.buf + usb3_req->req.actual; u32 tmp = 0; - bool is_last; + bool is_last = !len ? true : false; if (usb3_wait_pipe_status(usb3_ep, PX_STA_BUFSTS) < 0) return -EBUSY; @@ -1059,7 +1059,8 @@ static int usb3_write_pipe(struct renesas_usb3_ep *usb3_ep, usb3_write(usb3, tmp, fifo_reg); } - is_last = usb3_is_transfer_complete(usb3_ep, usb3_req); + if (!is_last) + is_last = usb3_is_transfer_complete(usb3_ep, usb3_req); /* Send the data */ usb3_set_px_con_send(usb3_ep, len, is_last); -- cgit From 6124607acc88fffeaadf3aacfeb3cc1304c87387 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:12 +0900 Subject: usb: renesas_usbhs: fix the BCLR setting condition for non-DCP pipe This patch fixes an issue that the driver sets the BCLR bit of {C,Dn}FIFOCTR register to 1 even when it's non-DCP pipe and the FRDY bit of {C,Dn}FIFOCTR register is set to 1. Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Cc: # v3.1+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/renesas_usbhs/fifo.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index d1af831f43eb..03cac07f57b5 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -282,11 +282,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_fifo *fifo) { struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); + int ret = 0; if (!usbhs_pipe_is_dcp(pipe)) - usbhsf_fifo_barrier(priv, fifo); + ret = usbhsf_fifo_barrier(priv, fifo); - usbhs_write(priv, fifo->ctr, BCLR); + /* + * if non-DCP pipe, this driver should set BCLR when + * usbhsf_fifo_barrier() returns 0. + */ + if (!ret) + usbhs_write(priv, fifo->ctr, BCLR); } static int usbhsf_fifo_rcv_len(struct usbhs_priv *priv, -- cgit From 0a2ce62b61f2c76d0213edf4e37aaf54a8ddf295 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:13 +0900 Subject: usb: renesas_usbhs: fix usbhsf_fifo_clear() for RX direction This patch fixes an issue that the usbhsf_fifo_clear() is possible to cause 10 msec delay if the pipe is RX direction and empty because the FRDY bit will never be set to 1 in such case. Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Cc: # v3.1+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/renesas_usbhs/fifo.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index 03cac07f57b5..68f26904c316 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -284,8 +284,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); int ret = 0; - if (!usbhs_pipe_is_dcp(pipe)) - ret = usbhsf_fifo_barrier(priv, fifo); + if (!usbhs_pipe_is_dcp(pipe)) { + /* + * This driver checks the pipe condition first to avoid -EBUSY + * from usbhsf_fifo_barrier() with about 10 msec delay in + * the interrupt handler if the pipe is RX direction and empty. + */ + if (usbhs_pipe_is_dir_in(pipe)) + ret = usbhs_pipe_is_accessible(pipe); + if (!ret) + ret = usbhsf_fifo_barrier(priv, fifo); + } /* * if non-DCP pipe, this driver should set BCLR when -- cgit From addfc5823dbf3e6ed400e98e49c7e64b10e191d6 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Tue, 12 Sep 2017 10:24:40 +0100 Subject: usb: gadget: ffs: handle I/O completion in-order By submitting completed transfers to the system workqueue there is no guarantee that completion events will be queued up in the correct order, as in multi-processor systems there is a thread running for each processor and the work items are not bound to a particular core. This means that several completions are in the queue at the same time, they may be processed in parallel and complete out of order, resulting in data appearing corrupt when read by userspace. Create a single-threaded workqueue for FunctionFS so that data completed requests is passed to userspace in the order in which they complete. Acked-by: Michal Nazarewicz Signed-off-by: John Keeping Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 17 +++++++++++++---- drivers/usb/gadget/function/u_fs.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 9990944a7245..8b342587f8ad 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -46,7 +46,8 @@ static void ffs_data_get(struct ffs_data *ffs); static void ffs_data_put(struct ffs_data *ffs); /* Creates new ffs_data object. */ -static struct ffs_data *__must_check ffs_data_new(void) __attribute__((malloc)); +static struct ffs_data *__must_check ffs_data_new(const char *dev_name) + __attribute__((malloc)); /* Opened counter handling. */ static void ffs_data_opened(struct ffs_data *ffs); @@ -780,11 +781,12 @@ static void ffs_epfile_async_io_complete(struct usb_ep *_ep, struct usb_request *req) { struct ffs_io_data *io_data = req->context; + struct ffs_data *ffs = io_data->ffs; ENTER(); INIT_WORK(&io_data->work, ffs_user_copy_worker); - schedule_work(&io_data->work); + queue_work(ffs->io_completion_wq, &io_data->work); } static void __ffs_epfile_read_buffer_free(struct ffs_epfile *epfile) @@ -1500,7 +1502,7 @@ ffs_fs_mount(struct file_system_type *t, int flags, if (unlikely(ret < 0)) return ERR_PTR(ret); - ffs = ffs_data_new(); + ffs = ffs_data_new(dev_name); if (unlikely(!ffs)) return ERR_PTR(-ENOMEM); ffs->file_perms = data.perms; @@ -1610,6 +1612,7 @@ static void ffs_data_put(struct ffs_data *ffs) BUG_ON(waitqueue_active(&ffs->ev.waitq) || waitqueue_active(&ffs->ep0req_completion.wait) || waitqueue_active(&ffs->wait)); + destroy_workqueue(ffs->io_completion_wq); kfree(ffs->dev_name); kfree(ffs); } @@ -1642,7 +1645,7 @@ static void ffs_data_closed(struct ffs_data *ffs) ffs_data_put(ffs); } -static struct ffs_data *ffs_data_new(void) +static struct ffs_data *ffs_data_new(const char *dev_name) { struct ffs_data *ffs = kzalloc(sizeof *ffs, GFP_KERNEL); if (unlikely(!ffs)) @@ -1650,6 +1653,12 @@ static struct ffs_data *ffs_data_new(void) ENTER(); + ffs->io_completion_wq = alloc_ordered_workqueue("%s", 0, dev_name); + if (!ffs->io_completion_wq) { + kfree(ffs); + return NULL; + } + refcount_set(&ffs->ref, 1); atomic_set(&ffs->opened, 0); ffs->state = FFS_READ_DESCRIPTORS; diff --git a/drivers/usb/gadget/function/u_fs.h b/drivers/usb/gadget/function/u_fs.h index 540f1c48c1a8..79f70ebf85dc 100644 --- a/drivers/usb/gadget/function/u_fs.h +++ b/drivers/usb/gadget/function/u_fs.h @@ -279,6 +279,7 @@ struct ffs_data { } file_perms; struct eventfd_ctx *ffs_eventfd; + struct workqueue_struct *io_completion_wq; bool no_disconnect; struct work_struct reset_work; -- cgit From 6baeda120d90aa637b08f7604de104ab00ce9126 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 31 Aug 2017 14:51:40 +0200 Subject: usb: gadget: udc: atmel: set vbus irqflags explicitly The driver triggers actions on both edges of the vbus signal. The former PIO controller was triggering IRQs on both falling and rising edges by default. Newer PIO controller don't, so it's better to set it explicitly to IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING. Without this patch we may trigger the connection with host but only on some bouncing signal conditions and thus lose connecting events. Acked-by: Ludovic Desroches Signed-off-by: Nicolas Ferre Cc: stable # v4.4+ Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/atmel_usba_udc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index 98d71400f8a1..a884c022df7a 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -29,6 +29,8 @@ #include #include "atmel_usba_udc.h" +#define USBA_VBUS_IRQFLAGS (IRQF_ONESHOT \ + | IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING) #ifdef CONFIG_USB_GADGET_DEBUG_FS #include @@ -2361,7 +2363,7 @@ static int usba_udc_probe(struct platform_device *pdev) IRQ_NOAUTOEN); ret = devm_request_threaded_irq(&pdev->dev, gpio_to_irq(udc->vbus_pin), NULL, - usba_vbus_irq_thread, IRQF_ONESHOT, + usba_vbus_irq_thread, USBA_VBUS_IRQFLAGS, "atmel_usba_udc", udc); if (ret) { udc->vbus_pin = -ENODEV; -- cgit From c3cdce45f8d3dfa5c3467894aa89798314920328 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 30 Aug 2017 19:03:52 +0800 Subject: usb: dwc3: of-simple: Add compatible for Spreadtrum SC9860 platform Add compatible string to use this generic glue layer to support Spreadtrum SC9860 platform's dwc3 controller. Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/dwc3-of-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/dwc3-of-simple.c b/drivers/usb/dwc3/dwc3-of-simple.c index 4cef7d4f9cd0..a26d1fde0f5e 100644 --- a/drivers/usb/dwc3/dwc3-of-simple.c +++ b/drivers/usb/dwc3/dwc3-of-simple.c @@ -177,6 +177,7 @@ static const struct of_device_id of_dwc3_simple_match[] = { { .compatible = "rockchip,rk3399-dwc3" }, { .compatible = "xlnx,zynqmp-dwc3" }, { .compatible = "cavium,octeon-7130-usb-uctl" }, + { .compatible = "sprd,sc9860-dwc3" }, { /* Sentinel */ } }; MODULE_DEVICE_TABLE(of, of_dwc3_simple_match); -- cgit From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Thu, 28 Sep 2017 12:37:31 +0800 Subject: irq/generic-chip: Don't replace domain's name When generic irq chips are allocated for an irq domain the domain name is set to the irq chip name. That was done to have named domains before the recent changes which enforce domain naming were done. Since then the overwrite causes a memory leak when the domain name is dynamically allocated and even worse it would cause the domain free code to free the wrong name pointer, which might point to a constant. Remove the name assignment to prevent this. Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com --- kernel/irq/generic-chip.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..5270a54b9fa4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } - d->name = name; return 0; } EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); -- cgit From 77c01d11bbb2b5c005347061bf543ab94878314c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 26 Sep 2017 10:36:03 +0100 Subject: watchdog/hardlockup/perf: Fix spelling mistake: "permanetely" -> "permanently" Trivial fix to spelling mistake in pr_info message Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Acked-by: Don Zickus Cc: Andrew Morton Cc: Babu Moger Link: https://lkml.kernel.org/r/20170926093603.7756-1-colin.king@canonical.com --- kernel/watchdog_hld.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 204a8cadb717..71a62ceacdc8 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -280,7 +280,7 @@ int __init hardlockup_detector_perf_init(void) int ret = hardlockup_detector_event_create(); if (ret) { - pr_info("Perf NMI watchdog permanetely disabled\n"); + pr_info("Perf NMI watchdog permanently disabled\n"); } else { perf_event_release_kernel(this_cpu_read(watchdog_ev)); this_cpu_write(watchdog_ev, NULL); -- cgit From 8c28ef3f1c1c57b6f468343d5959e5125b30334d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 25 Sep 2017 02:01:01 -0600 Subject: xen-pciback: relax BAR sizing write value check Just like done in d2bd05d88d ("xen-pciback: return proper values during BAR sizing") for the ROM BAR, ordinary ones also shouldn't compare the written value directly against ~0, but consider the r/o bits at the bottom (if any). Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/xen-pciback/conf_space_header.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 5fbfd9cfb6d6..5b3d57fc82d3 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -169,6 +169,9 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) { struct pci_bar_info *bar = data; + unsigned int pos = (offset - PCI_BASE_ADDRESS_0) / 4; + const struct resource *res = dev->resource; + u32 mask; if (unlikely(!bar)) { pr_warn(DRV_NAME ": driver data not found for %s\n", @@ -179,7 +182,13 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) /* A write to obtain the length must happen as a 32-bit write. * This does not (yet) support writing individual bytes */ - if (value == ~0) + if (res[pos].flags & IORESOURCE_IO) + mask = ~PCI_BASE_ADDRESS_IO_MASK; + else if (pos && (res[pos - 1].flags & IORESOURCE_MEM_64)) + mask = 0; + else + mask = ~PCI_BASE_ADDRESS_MEM_MASK; + if ((value | mask) == ~0U) bar->which = 1; else { u32 tmpval; -- cgit From 0d805ee70a69eabd38160dc199e183ac2f13fe4b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 27 Sep 2017 02:41:25 -0700 Subject: xen/mmu: Call xen_cleanhighmap() with 4MB aligned for page tables mapping When bootup a PVM guest with large memory(Ex.240GB), XEN provided initial mapping overlaps with kernel module virtual space. When mapping in this space is cleared by xen_cleanhighmap(), in certain case there could be an 2MB mapping left. This is due to XEN initialize 4MB aligned mapping but xen_cleanhighmap() finish at 2MB boundary. When module loading is just on top of the 2MB space, got below warning: WARNING: at mm/vmalloc.c:106 vmap_pte_range+0x14e/0x190() Call Trace: [] warn_alloc_failed+0xf3/0x160 [] __vmalloc_area_node+0x182/0x1c0 [] ? module_alloc_update_bounds+0x1e/0x80 [] __vmalloc_node_range+0xa7/0x110 [] ? module_alloc_update_bounds+0x1e/0x80 [] module_alloc+0x64/0x70 [] ? module_alloc_update_bounds+0x1e/0x80 [] module_alloc_update_bounds+0x1e/0x80 [] move_module+0x27/0x150 [] layout_and_allocate+0x120/0x1b0 [] load_module+0x78/0x640 [] ? security_file_permission+0x8b/0x90 [] sys_init_module+0x62/0x1e0 [] system_call_fastpath+0x16/0x1b Then the mapping of 2MB is cleared, finally oops when the page in that space is accessed. BUG: unable to handle kernel paging request at ffff880022600000 IP: [] clear_page_c_e+0x7/0x10 PGD 1788067 PUD 178c067 PMD 22434067 PTE 0 Oops: 0002 [#1] SMP Call Trace: [] ? prep_new_page+0x127/0x1c0 [] get_page_from_freelist+0x1e2/0x550 [] ? ii_iovec_copy_to_user+0x90/0x140 [] __alloc_pages_nodemask+0x12d/0x230 [] alloc_pages_vma+0xc6/0x1a0 [] ? pte_mfn_to_pfn+0x7d/0x100 [] do_anonymous_page+0x16b/0x350 [] handle_pte_fault+0x1e4/0x200 [] ? xen_pmd_val+0xe/0x10 [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e [] handle_mm_fault+0x15b/0x270 [] do_page_fault+0x140/0x470 [] page_fault+0x25/0x30 Call xen_cleanhighmap() with 4MB aligned for page tables mapping to fix it. The unnecessory call of xen_cleanhighmap() in DEBUG mode is also removed. -v2: add comment about XEN alignment from Juergen. References: https://lists.xen.org/archives/html/xen-devel/2012-07/msg01562.html Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross [boris: added 'xen/mmu' tag to commit subject] Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 509f560bd0c6..58b09fcadbaa 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1238,21 +1238,16 @@ static void __init xen_pagetable_cleanhighmap(void) * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page * tables - do note that they are accessible at this stage via __va. - * For good measure we also round up to the PMD - which means that if + * As Xen is aligning the memory end to a 4MB boundary, for good + * measure we also round up to PMD_SIZE * 2 - which means that if * anybody is using __ka address to the initial boot-stack - and try * to use it - they are going to crash. The xen_start_info has been * taken care of already in xen_setup_kernel_pagetable. */ addr = xen_start_info->pt_base; - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + size = xen_start_info->nr_pt_frames * PAGE_SIZE; - xen_cleanhighmap(addr, addr + size); + xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2)); xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); -#ifdef DEBUG - /* This is superfluous and is not necessary, but you know what - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of - * anything at this stage. */ - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); -#endif } #endif -- cgit From 686fef928bba6be13cabe639f154af7d72b63120 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Sep 2017 06:38:17 -0700 Subject: timer: Prepare to change timer callback argument type Modern kernel callback systems pass the structure associated with a given callback to the callback function. The timer callback remains one of the legacy cases where an arbitrary unsigned long argument continues to be passed as the callback argument. This has several problems: - This bloats the timer_list structure with a normally redundant .data field. - No type checking is being performed, forcing callbacks to do explicit type casts of the unsigned long argument into the object that was passed, rather than using container_of(), as done in most of the other callback infrastructure. - Neighboring buffer overflows can overwrite both the .function and the .data field, providing attackers with a way to elevate from a buffer overflow into a simplistic ROP-like mechanism that allows calling arbitrary functions with a controlled first argument. - For future Control Flow Integrity work, this creates a unique function prototype for timer callbacks, instead of allowing them to continue to be clustered with other void functions that take a single unsigned long argument. This adds a new timer initialization API, which will ultimately replace the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family, named timer_setup() (to mirror hrtimer_setup(), making instances of its use much easier to grep for). In order to support the migration of existing timers into the new callback arguments, timer_setup() casts its arguments to the existing legacy types, and explicitly passes the timer pointer as the legacy data argument. Once all setup_*timer() callers have been replaced with timer_setup(), the casts can be removed, and the data argument can be dropped with the timer expiration code changed to just pass the timer to the callback directly. Since the regular pattern of using container_of() during local variable declaration repeats the need for the variable type declaration to be included, this adds a helper modeled after other from_*() helpers that wrap container_of(), named from_timer(). This helper uses typeof(*variable), removing the type redundancy and minimizing the need for line wraps in forthcoming conversions from "unsigned data long" to "struct timer_list *" in the timer callbacks: -void callback(unsigned long data) +void callback(struct timer_list *t) { - struct some_data_structure *local = (struct some_data_structure *)data; + struct some_data_structure *local = from_timer(local, t, timer); Finally, in order to support the handful of timer users that perform open-coded assignments of the .function (and .data) fields, provide cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used temporarily. Once conversion has been completed, these can be globally trivially removed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast --- include/linux/timer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/timer.h b/include/linux/timer.h index e6789b8757d5..6383c528b148 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -168,6 +168,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + +static inline void timer_setup(struct timer_list *timer, + void (*callback)(struct timer_list *), + unsigned int flags) +{ + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, + (TIMER_DATA_TYPE)timer, flags); +} + +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + /** * timer_pending - is a timer pending? * @timer: the timer in question -- cgit From 1fa4df3e688902d033dfda796eb83ae6ad8d0488 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 27 Sep 2017 16:35:00 -0500 Subject: percpu: fix iteration to prevent skipping over block The iterator functions pcpu_next_md_free_region and pcpu_next_fit_region use the block offset to determine if they have checked the area in the prior iteration. However, this causes an issue when the block offset is greater than subsequent block contig hints. If within the iterator it moves to check subsequent blocks, it may fail in the second predicate due to the block offset not being cleared. Thus, this causes the allocator to skip over blocks leading to false failures when allocating from the reserved chunk. While this happens in the general case as well, it will only fail if it cannot allocate a new chunk. This patch resets the block offset to 0 to pass the second predicate when checking subseqent blocks within the iterator function. Signed-off-by: Dennis Zhou Reported-and-tested-by: Luis Henriques Signed-off-by: Tejun Heo --- mm/percpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/percpu.c b/mm/percpu.c index 59d44d61f5f1..aa121cef76de 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -353,6 +353,8 @@ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, block->contig_hint_start); return; } + /* reset to satisfy the second predicate above */ + block_off = 0; *bits = block->right_free; *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; @@ -407,6 +409,8 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, *bit_off = pcpu_block_off_to_off(i, block->first_free); return; } + /* reset to satisfy the second predicate above */ + block_off = 0; *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, align); -- cgit From 2580c4c17aee3ad58e9751012bad278dd074ccae Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 28 Sep 2017 11:32:37 +0200 Subject: tun: bail out from tun_get_user() if the skb is empty KMSAN (https://github.com/google/kmsan) reported accessing uninitialized skb->data[0] in the case the skb is empty (i.e. skb->len is 0): ================================================ BUG: KMSAN: use of uninitialized memory in tun_get_user+0x19ba/0x3770 CPU: 0 PID: 3051 Comm: probe Not tainted 4.13.0+ #3140 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: ... __msan_warning_32+0x66/0xb0 mm/kmsan/kmsan_instr.c:477 tun_get_user+0x19ba/0x3770 drivers/net/tun.c:1301 tun_chr_write_iter+0x19f/0x300 drivers/net/tun.c:1365 call_write_iter ./include/linux/fs.h:1743 new_sync_write fs/read_write.c:457 __vfs_write+0x6c3/0x7f0 fs/read_write.c:470 vfs_write+0x3e4/0x770 fs/read_write.c:518 SYSC_write+0x12f/0x2b0 fs/read_write.c:565 SyS_write+0x55/0x80 fs/read_write.c:557 do_syscall_64+0x242/0x330 arch/x86/entry/common.c:284 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:245 ... origin: ... kmsan_poison_shadow+0x6e/0xc0 mm/kmsan/kmsan.c:211 slab_alloc_node mm/slub.c:2732 __kmalloc_node_track_caller+0x351/0x370 mm/slub.c:4351 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x26a/0x810 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:903 alloc_skb_with_frags+0x1d7/0xc80 net/core/skbuff.c:4756 sock_alloc_send_pskb+0xabf/0xfe0 net/core/sock.c:2037 tun_alloc_skb drivers/net/tun.c:1144 tun_get_user+0x9a8/0x3770 drivers/net/tun.c:1274 tun_chr_write_iter+0x19f/0x300 drivers/net/tun.c:1365 call_write_iter ./include/linux/fs.h:1743 new_sync_write fs/read_write.c:457 __vfs_write+0x6c3/0x7f0 fs/read_write.c:470 vfs_write+0x3e4/0x770 fs/read_write.c:518 SYSC_write+0x12f/0x2b0 fs/read_write.c:565 SyS_write+0x55/0x80 fs/read_write.c:557 do_syscall_64+0x242/0x330 arch/x86/entry/common.c:284 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:245 ================================================ Make sure tun_get_user() doesn't touch skb->data[0] unless there is actual data. C reproducer below: ========================== // autogenerated by syzkaller (http://github.com/google/syzkaller) #define _GNU_SOURCE #include #include #include #include #include #include int main() { int sock = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); int tun_fd = open("/dev/net/tun", O_RDWR); struct ifreq req; memset(&req, 0, sizeof(struct ifreq)); strcpy((char*)&req.ifr_name, "gre0"); req.ifr_flags = IFF_UP | IFF_MULTICAST; ioctl(tun_fd, TUNSETIFF, &req); ioctl(sock, SIOCSIFFLAGS, "gre0"); write(tun_fd, "hi", 0); return 0; } ========================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller --- drivers/net/tun.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3c9985f29950..5ce580f413b9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1496,11 +1496,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { - switch (skb->data[0] & 0xf0) { - case 0x40: + u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; + + switch (ip_version) { + case 4: pi.proto = htons(ETH_P_IP); break; - case 0x60: + case 6: pi.proto = htons(ETH_P_IPV6); break; default: -- cgit From c0a1666bcb2a33e84187a15eabdcd54056be9a97 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Sep 2017 17:58:41 +0200 Subject: KVM: VMX: use cmpxchg64 This fixes a compilation failure on 32-bit systems. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b9d2140eb212..7f62c94196d1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2238,8 +2238,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.ndst = (dest << 8) & 0xFF00; new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -11730,8 +11730,8 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); @@ -11800,8 +11800,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); /* We should not block the vCPU if an interrupt is posted for it. */ if (pi_test_on(pi_desc) == 1) -- cgit From b28503a3fe6400757817e4460090f96bc1b9d6e7 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 15 Sep 2017 09:14:03 +0200 Subject: perf test: Fix vmlinux failure on s390x On s390x perf test 1 failed. It turned out that commit 4a084ecfc821 ("perf report: Fix module symbol adjustment for s390x") was incorrect. The previous implementation in dso__load_sym() is also suitable for s390x. Therefore this patch undoes commit 4a084ecfc821. Signed-off-by: Thomas-Mich Richter Cc: Hendrik Brueckner Cc: Zvonko Kosic Fixes: 4a084ecfc821 ("perf report: Fix module symbol adjustment for s390x") LPU-Reference: 20170915071404.58398-1-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-5ani7ly57zji7s0hmzkx416l@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/util/sym-handling.c | 8 -------- tools/perf/util/symbol-elf.c | 8 +------- tools/perf/util/symbol.h | 3 --- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/tools/perf/arch/s390/util/sym-handling.c b/tools/perf/arch/s390/util/sym-handling.c index e103f6e46afe..581d4c5a896b 100644 --- a/tools/perf/arch/s390/util/sym-handling.c +++ b/tools/perf/arch/s390/util/sym-handling.c @@ -18,12 +18,4 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) return false; return ehdr.e_type == ET_REL || ehdr.e_type == ET_DYN; } - -void arch__adjust_sym_map_offset(GElf_Sym *sym, - GElf_Shdr *shdr __maybe_unused, - struct map *map) -{ - if (map->type == MAP__FUNCTION) - sym->st_value += map->start; -} #endif diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 5c39f420111e..9cf781f0d8a2 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -810,12 +810,6 @@ static u64 ref_reloc(struct kmap *kmap) void __weak arch__sym_update(struct symbol *s __maybe_unused, GElf_Sym *sym __maybe_unused) { } -void __weak arch__adjust_sym_map_offset(GElf_Sym *sym, GElf_Shdr *shdr, - struct map *map __maybe_unused) -{ - sym->st_value -= shdr->sh_addr - shdr->sh_offset; -} - int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, int kmodule) { @@ -996,7 +990,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, /* Adjust symbol to map to file offset */ if (adjust_kernel_syms) - arch__adjust_sym_map_offset(&sym, &shdr, map); + sym.st_value -= shdr.sh_addr - shdr.sh_offset; if (strcmp(section_name, (curr_dso->short_name + diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 2bd6a1f01a1c..aad99e7e179b 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -344,9 +344,6 @@ int setup_intlist(struct intlist **list, const char *list_str, #ifdef HAVE_LIBELF_SUPPORT bool elf__needs_adjust_symbols(GElf_Ehdr ehdr); void arch__sym_update(struct symbol *s, GElf_Sym *sym); -void arch__adjust_sym_map_offset(GElf_Sym *sym, - GElf_Shdr *shdr __maybe_unused, - struct map *map __maybe_unused); #endif #define SYMBOL_A 0 -- cgit From 5357413f5c067f60933e4b8d79d483fbe62b2bb5 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 15 Sep 2017 09:14:04 +0200 Subject: perf test: Fix vmlinux failure on s390x part 2 On s390x perf test 1 failed. It turned out that commit cf6383f73cf2 ("perf report: Fix kernel symbol adjustment for s390x") was incorrect. The previous implementation in dso__load_sym() is also suitable for s390x. Therefore this patch undoes commit cf6383f73cf2 Signed-off-by: Thomas-Mich Richter Cc: Zvonko Kosic Cc: Hendrik Brueckner Fixes: cf6383f73cf2 ("perf report: Fix kernel symbol adjustment for s390x") LPU-Reference: 20170915071404.58398-2-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-v101o8k25vuja2ogosgf15yy@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/util/Build | 1 - tools/perf/arch/s390/util/sym-handling.c | 21 --------------------- 2 files changed, 22 deletions(-) delete mode 100644 tools/perf/arch/s390/util/sym-handling.c diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build index bd518b623d7a..5bd7b9260cc0 100644 --- a/tools/perf/arch/s390/util/Build +++ b/tools/perf/arch/s390/util/Build @@ -1,5 +1,4 @@ libperf-y += header.o -libperf-y += sym-handling.o libperf-y += kvm-stat.o libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/s390/util/sym-handling.c b/tools/perf/arch/s390/util/sym-handling.c deleted file mode 100644 index 581d4c5a896b..000000000000 --- a/tools/perf/arch/s390/util/sym-handling.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Architecture specific ELF symbol handling and relocation mapping. - * - * Copyright 2017 IBM Corp. - * Author(s): Thomas Richter - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - */ - -#include "symbol.h" - -#ifdef HAVE_LIBELF_SUPPORT -bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) -{ - if (ehdr.e_type == ET_EXEC) - return false; - return ehdr.e_type == ET_REL || ehdr.e_type == ET_DYN; -} -#endif -- cgit From bd86e32059526e2d0d13ca1e4447dfbbddb6e5cc Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Wed, 27 Sep 2017 20:28:57 +0800 Subject: dm crypt: fix memory leak in crypt_ctr_cipher_old() Fix memory leak of cipher_api. Fixes: 33d2f09fcb35 (dm crypt: introduce new format of cipher with "capi:" prefix) Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Jeffy Chen Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a55ffd4f5933..75341fdca4b6 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2466,6 +2466,7 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key kfree(cipher_api); return ret; } + kfree(cipher_api); return 0; bad_mem: -- cgit From aff3da39211105a42b2108b8af79bf8e16f670fd Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Mon, 25 Sep 2017 14:59:46 +0200 Subject: net: mvpp2: fix parsing fragmentation detection Parsing fragmentation detection failed due to wrong configured parser TCAM entry's. Some traffic was marked as fragmented in RX descriptor, even it wasn't IP fragmented. The hardware also failed to calculate checksums which lead to use software checksum and caused performance degradation. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Antoine Tenart Signed-off-by: Stefan Chulski Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index dd0ee2691c86..da04939a2748 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -676,6 +676,7 @@ enum mvpp2_tag_type { #define MVPP2_PRS_RI_L3_MCAST BIT(15) #define MVPP2_PRS_RI_L3_BCAST (BIT(15) | BIT(16)) #define MVPP2_PRS_RI_IP_FRAG_MASK 0x20000 +#define MVPP2_PRS_RI_IP_FRAG_TRUE BIT(17) #define MVPP2_PRS_RI_UDF3_MASK 0x300000 #define MVPP2_PRS_RI_UDF3_RX_SPECIAL BIT(21) #define MVPP2_PRS_RI_L4_PROTO_MASK 0x1c00000 @@ -2315,7 +2316,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, (proto != IPPROTO_IGMP)) return -EINVAL; - /* Fragmented packet */ + /* Not fragmented packet */ tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, MVPP2_PE_LAST_FREE_TID); if (tid < 0) @@ -2334,8 +2335,12 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, MVPP2_PRS_SRAM_OP_SEL_UDF_ADD); mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT, MVPP2_PRS_IPV4_DIP_AI_BIT); - mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_MASK, - ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK); + mvpp2_prs_sram_ri_update(&pe, ri, ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK); + + mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, + MVPP2_PRS_TCAM_PROTO_MASK_L); + mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, + MVPP2_PRS_TCAM_PROTO_MASK); mvpp2_prs_tcam_data_byte_set(&pe, 5, proto, MVPP2_PRS_TCAM_PROTO_MASK); mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT); @@ -2346,7 +2351,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4); mvpp2_prs_hw_write(priv, &pe); - /* Not fragmented packet */ + /* Fragmented packet */ tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, MVPP2_PE_LAST_FREE_TID); if (tid < 0) @@ -2358,8 +2363,11 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0; mvpp2_prs_sram_ri_update(&pe, ri, ri_mask); - mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, MVPP2_PRS_TCAM_PROTO_MASK_L); - mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, MVPP2_PRS_TCAM_PROTO_MASK); + mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_TRUE, + ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK); + + mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, 0x0); + mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, 0x0); /* Update shadow table and hw entry */ mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4); -- cgit From 6bf69a1d6334bed776875c5ca852594ab4e5b209 Mon Sep 17 00:00:00 2001 From: Yan Markman Date: Mon, 25 Sep 2017 14:59:47 +0200 Subject: net: mvpp2: fix port list indexing The private port_list array has a list of pointers to mvpp2_port instances. This list is allocated given the number of ports enabled in the device tree, but the pointers are set using the port-id property. If on a single port is enabled, the port_list array will be of size 1, but when registering the port, if its id is not 0 the driver will crash. Other crashes were encountered in various situations. This fixes the issue by using an index not equal to the value of the port-id property. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Antoine Tenart Signed-off-by: Yan Markman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index da04939a2748..b2f99df81e9c 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -7504,7 +7504,7 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv, /* Ports initialization */ static int mvpp2_port_probe(struct platform_device *pdev, struct device_node *port_node, - struct mvpp2 *priv) + struct mvpp2 *priv, int index) { struct device_node *phy_node; struct phy *comphy; @@ -7678,7 +7678,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, } netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr); - priv->port_list[id] = port; + priv->port_list[index] = port; return 0; err_free_port_pcpu: @@ -8013,10 +8013,12 @@ static int mvpp2_probe(struct platform_device *pdev) } /* Initialize ports */ + i = 0; for_each_available_child_of_node(dn, port_node) { - err = mvpp2_port_probe(pdev, port_node, priv); + err = mvpp2_port_probe(pdev, port_node, priv, i); if (err < 0) goto err_mg_clk; + i++; } platform_set_drvdata(pdev, priv); -- cgit From c7dfc8c848a48f176096f66a14879fb3333a460f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 25 Sep 2017 14:59:48 +0200 Subject: net: mvpp2: do not select the internal source clock This patch stops the internal MAC Tx clock from being enabled as the internal clock isn't used. The definition used for the bit controlling this behaviour is renamed as well as it was wrongly named (bit 4 of GMAC_CTRL_2_REG). Fixes: 3919357fb0bb ("net: mvpp2: initialize the GMAC when using a port") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index b2f99df81e9c..161055564720 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -333,7 +333,7 @@ #define MVPP2_GMAC_INBAND_AN_MASK BIT(0) #define MVPP2_GMAC_FLOW_CTRL_MASK GENMASK(2, 1) #define MVPP2_GMAC_PCS_ENABLE_MASK BIT(3) -#define MVPP2_GMAC_PORT_RGMII_MASK BIT(4) +#define MVPP2_GMAC_INTERNAL_CLK_MASK BIT(4) #define MVPP2_GMAC_DISABLE_PADDING BIT(5) #define MVPP2_GMAC_PORT_RESET_MASK BIT(6) #define MVPP2_GMAC_AUTONEG_CONFIG 0xc @@ -4599,7 +4599,6 @@ static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port) val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK; } else if (phy_interface_mode_is_rgmii(port->phy_interface)) { val &= ~MVPP2_GMAC_PCS_ENABLE_MASK; - val |= MVPP2_GMAC_PORT_RGMII_MASK; } writel(val, port->base + MVPP2_GMAC_CTRL_2_REG); -- cgit From 35f493b87ec072c5a2497ffbee243095ef725827 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Sep 2017 08:40:02 -0700 Subject: inetpeer: fix RCU lookup() again My prior fix was not complete, as we were dereferencing a pointer three times per node, not twice as I initially thought. Fixes: 4cc5b44b29a9 ("inetpeer: fix RCU lookup()") Fixes: b145425f269a ("inetpeer: remove AVL implementation in favor of RB tree") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..b20c8ac64081 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; -- cgit From db06ae41945b14feb7f696dcafe8048cc37e8a20 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 25 Sep 2017 23:32:20 +0200 Subject: net: dsa: mv88e6xxx: Allow dsa and cpu ports in multiple vlans Ports with the same VLAN must all be in the same bridge. However the CPU and DSA ports need to be in multiple VLANs spread over multiple bridges. So exclude them when performing this test. Fixes: b2f81d304cee ("net: dsa: add CPU and DSA ports as VLAN members") Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index c6678aa9b4ef..674dab71d71c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1100,6 +1100,10 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, }; int i, err; + /* DSA and CPU ports have to be members of multiple vlans */ + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + return 0; + if (!vid_begin) return -EOPNOTSUPP; -- cgit From e804441cfe0b60f6c430901946a69c01eac09df1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 25 Sep 2017 15:55:53 -0700 Subject: net: dsa: Fix network device registration order We cannot be registering the network device first, then setting its carrier off and finally connecting it to a PHY, doing that leaves a window during which the carrier is at best inconsistent, and at worse the device is not usable without a down/up sequence since the network device is visible to user space with possibly no PHY device attached. Re-order steps so that they make logical sense. This fixes some devices where the port was not usable after e.g: an unbind then bind of the driver. Fixes: 0071f56e46da ("dsa: Register netdev before phy") Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..865e29e62bad 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1301,28 +1301,33 @@ int dsa_slave_create(struct dsa_port *port, const char *name) p->old_duplex = -1; port->netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - port->netdev = NULL; - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; - } netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(p->phy); + if (of_phy_is_fixed_link(p->dp->dn)) + of_phy_deregister_fixed_link(p->dp->dn); +out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); + port->netdev = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) -- cgit From 06d7a1b932c26afe2c0a1f4520ddd417d8eeda79 Mon Sep 17 00:00:00 2001 From: Ed Blake Date: Tue, 26 Sep 2017 11:43:46 +0100 Subject: net: stmmac: dwc-qos: Add suspend / resume support Add hook to stmmac_pltfr_pm_ops for suspend / resume handling. Signed-off-by: Ed Blake Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index dd6a2f9791cc..5efef8001edf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -511,6 +511,7 @@ static struct platform_driver dwc_eth_dwmac_driver = { .remove = dwc_eth_dwmac_remove, .driver = { .name = "dwc-eth-dwmac", + .pm = &stmmac_pltfr_pm_ops, .of_match_table = dwc_eth_dwmac_match, }, }; -- cgit From 1579f678fb4397f9e439d2e373d4ade036c673b4 Mon Sep 17 00:00:00 2001 From: Ed Blake Date: Tue, 26 Sep 2017 11:44:53 +0100 Subject: net: stmmac: dwmac4: Re-enable MAC Rx before powering down Re-enable the MAC receiver by setting CONFIG_RE before powering down, as instructed in section 6.3.5.1 of [1]. Without this the MAC fails to receive WoL packets and never wakes up. [1] DWC Ethernet QoS Databook 4.10a October 2014 Signed-off-by: Ed Blake Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index c4407e8e39a3..2f7d7ec59962 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -296,6 +296,7 @@ static void dwmac4_pmt(struct mac_device_info *hw, unsigned long mode) { void __iomem *ioaddr = hw->pcsr; unsigned int pmt = 0; + u32 config; if (mode & WAKE_MAGIC) { pr_debug("GMAC: WOL Magic frame\n"); @@ -306,6 +307,12 @@ static void dwmac4_pmt(struct mac_device_info *hw, unsigned long mode) pmt |= power_down | global_unicast | wake_up_frame_en; } + if (pmt) { + /* The receiver must be enabled for WOL before powering down */ + config = readl(ioaddr + GMAC_CONFIG); + config |= GMAC_CONFIG_RE; + writel(config, ioaddr + GMAC_CONFIG); + } writel(pmt, ioaddr + GMAC_PMT); } -- cgit From 4971613c1639d8e5f102c4e797c3bf8f83a5a69e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:19:37 -0400 Subject: packet: in packet_do_bind, test fanout with bind_lock held Once a socket has po->fanout set, it remains a member of the group until it is destroyed. The prot_hook must be constant and identical across sockets in the group. If fanout_add races with packet_do_bind between the test of po->fanout and taking the lock, the bind call may make type or dev inconsistent with that of the fanout group. Hold po->bind_lock when testing po->fanout to avoid this race. I had to introduce artificial delay (local_bh_enable) to actually observe the race. Fixes: dc99f600698d ("packet: Add fanout support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d288f52c53f7..a10c2836465c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3069,13 +3069,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { -- cgit From da7c9561015e93d10fe6aab73e9288e0d09d65a6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:20:17 -0400 Subject: packet: only test po->has_vnet_hdr once in packet_snd Packet socket option po->has_vnet_hdr can be updated concurrently with other operations if no ring is attached. Do not test the option twice in packet_snd, as the value may change in between calls. A race on setsockopt disable may cause a packet > mtu to be sent without having GSO options set. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a10c2836465c..bec01a3daf5b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2840,6 +2840,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2883,6 +2884,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2941,7 +2943,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; -- cgit From b32ca44a88def4bf92626d8777494c6f14638c42 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 14:57:21 -0400 Subject: net: dsa: mv88e6xxx: lock mutex when freeing IRQs mv88e6xxx_g2_irq_free locks the registers mutex, but not mv88e6xxx_g1_irq_free, which results in a stack trace from assert_reg_lock when unloading the mv88e6xxx module. Fix this. Fixes: 3460a5770ce9 ("net: dsa: mv88e6xxx: Mask g1 interrupts and free interrupt") Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 674dab71d71c..d74c7335c512 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3951,7 +3951,9 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev) if (chip->irq > 0) { if (chip->info->g2_irqs > 0) mv88e6xxx_g2_irq_free(chip); + mutex_lock(&chip->reg_lock); mv88e6xxx_g1_irq_free(chip); + mutex_unlock(&chip->reg_lock); } } -- cgit From 9d538fa60bad4f7b23193c89e843797a1cf71ef3 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 26 Sep 2017 17:38:50 -0700 Subject: net: Set sk_prot_creator when cloning sockets to the right proto sk->sk_prot and sk->sk_prot_creator can differ when the app uses IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one). Which is why sk_prot_creator is there to make sure that sk_prot_free() does the kmem_cache_free() on the right kmem_cache slab. Now, if such a socket gets transformed back to a listening socket (using connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through sk_clone_lock() when a new connection comes in. But sk_prot_creator will still point to the IPv6 kmem_cache (as everything got copied in sk_clone_lock()). When freeing, we will thus put this memory back into the IPv6 kmem_cache although it was allocated in the IPv4 cache. I have seen memory corruption happening because of this. With slub-debugging and MEMCG_KMEM enabled this gives the warning "cache_from_obj: Wrong slab cache. TCPv6 but object is from TCP" A C-program to trigger this: void main(void) { int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); int new_fd, newest_fd, client_fd; struct sockaddr_in6 bind_addr; struct sockaddr_in bind_addr4, client_addr1, client_addr2; struct sockaddr unsp; int val; memset(&bind_addr, 0, sizeof(bind_addr)); bind_addr.sin6_family = AF_INET6; bind_addr.sin6_port = ntohs(42424); memset(&client_addr1, 0, sizeof(client_addr1)); client_addr1.sin_family = AF_INET; client_addr1.sin_port = ntohs(42424); client_addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&client_addr2, 0, sizeof(client_addr2)); client_addr2.sin_family = AF_INET; client_addr2.sin_port = ntohs(42421); client_addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&unsp, 0, sizeof(unsp)); unsp.sa_family = AF_UNSPEC; bind(fd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)); listen(fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr1, sizeof(client_addr1)); new_fd = accept(fd, NULL, NULL); close(fd); val = AF_INET; setsockopt(new_fd, SOL_IPV6, IPV6_ADDRFORM, &val, sizeof(val)); connect(new_fd, &unsp, sizeof(unsp)); memset(&bind_addr4, 0, sizeof(bind_addr4)); bind_addr4.sin_family = AF_INET; bind_addr4.sin_port = ntohs(42421); bind(new_fd, (struct sockaddr *)&bind_addr4, sizeof(bind_addr4)); listen(new_fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr2, sizeof(client_addr2)); newest_fd = accept(new_fd, NULL, NULL); close(new_fd); close(client_fd); close(new_fd); } As far as I can see, this bug has been there since the beginning of the git-days. Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index 9b7b6bbb2a23..7d55c05f449d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); -- cgit From e49aa15ef6c179f69e5578a271801f31a09e9a3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Sep 2017 13:20:32 -0700 Subject: Revert "Bluetooth: Add option for disabling legacy ioctl interfaces" This reverts commit dbbccdc4ced015cdd4051299bd87fbe0254ad351. It turns out that the "legacy" users aren't so legacy at all, and that turning off the legacy ioctl will break the current Qt bluetooth stack for bluetooth LE devices that were released just a couple of months ago. So it's simply not true that this was a legacy interface that hasn't been needed and is only limited to old legacy BT devices. Because I actually read Kconfig help messages, and actively try to turn off features that I don't need, I turned the option off. Then I spent _way_ too much time debugging BLE issues until I realized that it wasn't the Qt and subsurface development that had broken one of my dive computer BLE downloads, but simply my broken kernel config. Maybe in a decade it will be true that this is a legacy interface. And maybe with a better help-text and correct dependencies, this kind of legacy removal might be acceptable. But as things are right now both the commit message and the Kconfig help text were misleading, and the Kconfig option had the wrong dependenencies. There's no reason to keep that broken Kconfig option in the tree. Cc: Marcel Holtmann Cc: Johan Hedberg Signed-off-by: Linus Torvalds --- net/bluetooth/Kconfig | 10 ---------- net/bluetooth/hci_sock.c | 6 ------ 2 files changed, 16 deletions(-) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00..db82a40875e8 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. - source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af..65d734c165bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ done: release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit From 87cbde8d9081b91df86a21d0d743cd700e04890a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 01:45:10 +0200 Subject: PM / s2idle: Invoke the ->wake() platform callback earlier The role of the ->wake() platform callback for suspend-to-idle is to deal with possible spurious wakeups, among other things. The ACPI implementation of it, acpi_s2idle_wake(), additionally checks the conditions for entering the Low Power S0 Idle state by the platform and reports the ones that have not been met. However, the ->wake() platform callback is invoked after calling dpm_noirq_resume_devices(), which means that the power states of some devices may have changed since s2idle_enter() returned, so some unmet Low Power S0 Idle conditions may be reported incorrectly as a result of that. To avoid these false positives, reorder the invocations of the dpm_noirq_resume_devices() routine and the ->wake() platform callback in s2idle_loop(). Fixes: 726fb6b4f2a8 (ACPI / PM: Check low power idle constraints for debug only) Tested-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3e2b4f519009..ccd2d20e6b06 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -120,22 +120,26 @@ static void s2idle_loop(void) * frozen processes + suspended devices + idle processors. * Thus s2idle_enter() should be called right after * all devices have been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, + * so prevent them from terminating the loop right away. */ error = dpm_noirq_suspend_devices(PMSG_SUSPEND); if (!error) s2idle_enter(); + else if (error == -EBUSY && pm_wakeup_pending()) + error = 0; - dpm_noirq_resume_devices(PMSG_RESUME); - if (error && (error != -EBUSY || !pm_wakeup_pending())) { - dpm_noirq_end(); - break; - } - - if (s2idle_ops && s2idle_ops->wake) + if (!error && s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); + if (error) + break; + if (s2idle_ops && s2idle_ops->sync) s2idle_ops->sync(); -- cgit From bca73f595a566f0262967535bb5b2ea9c4271d9a Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 28 Sep 2017 22:37:35 -0500 Subject: powerpc: Fix workaround for spurious MCE on POWER9 In the recent commit d8bd9f3f0925 ("powerpc: Handle MCE on POWER9 with only DSISR bit 30 set") I screwed up the bit number. It should be bit 25 (IBM bit 38). Fixes: d8bd9f3f0925 ("powerpc: Handle MCE on POWER9 with only DSISR bit 30 set") Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce_power.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index f523125b9d34..72f153c6f3fa 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -626,7 +626,7 @@ long __machine_check_early_realmode_p9(struct pt_regs *regs) { /* * On POWER9 DD2.1 and below, it's possible to get a machine check - * caused by a paste instruction where only DSISR bit 30 is set. This + * caused by a paste instruction where only DSISR bit 25 is set. This * will result in the MCE handler seeing an unknown event and the kernel * crashing. An MCE that occurs like this is spurious, so we don't need * to do anything in terms of servicing it. If there is something that @@ -634,7 +634,7 @@ long __machine_check_early_realmode_p9(struct pt_regs *regs) * correct DSISR so that it can be serviced properly. So detect this * case and mark it as handled. */ - if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x40000000) + if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000) return 1; return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); -- cgit From 4fc0870d7e462fe3b86e0f938ae75ce884728c7d Mon Sep 17 00:00:00 2001 From: Christophe Lombard Date: Tue, 26 Sep 2017 10:15:21 +0200 Subject: cxl: Fix memory page not handled The in-kernel 'library' API can be called by drivers to help interaction with an IBM XSL on a POWER9 system. The cxllib_handle_fault() API is used to handle memory fault. All memory pages of the specified buffer have to be handled but under certain conditions,the last page may not be touched, and the address the adapter is trying to access is never sent to the kernel for resolution. This patch reworks start address of the loop with an address aligned on the page size. In this context, the last page is not missed. Signed-off-by: Christophe Lombard Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Fixes: 3ced8d730063 ("cxl: Export library to support IBM XSL"); Signed-off-by: Michael Ellerman --- drivers/misc/cxl/cxllib.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c index 5dba23ca2e5f..dc9bc1807fdf 100644 --- a/drivers/misc/cxl/cxllib.c +++ b/drivers/misc/cxl/cxllib.c @@ -219,8 +219,17 @@ int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags) down_read(&mm->mmap_sem); - for (dar = addr; dar < addr + size; dar += page_size) { - if (!vma || dar < vma->vm_start || dar > vma->vm_end) { + vma = find_vma(mm, addr); + if (!vma) { + pr_err("Can't find vma for addr %016llx\n", addr); + rc = -EFAULT; + goto out; + } + /* get the size of the pages allocated */ + page_size = vma_kernel_pagesize(vma); + + for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) { + if (dar < vma->vm_start || dar >= vma->vm_end) { vma = find_vma(mm, addr); if (!vma) { pr_err("Can't find vma for addr %016llx\n", addr); -- cgit From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Sep 2017 19:08:11 +0300 Subject: perf/aux: Only update ->aux_wakeup in non-overwrite mode The following commit: d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index") changed the AUX wakeup position calculation to rounddown(), which causes a division-by-zero in AUX overwrite mode (aka "snapshot mode"). The zero denominator results from the fact that perf record doesn't set aux_watermark to anything, in which case the kernel will set it to half the AUX buffer size, but only for non-overwrite mode. In the overwrite mode aux_watermark stays zero. The good news is that, AUX overwrite mode, wakeups don't happen and related bookkeeping is not relevant, so we can simply forego the whole wakeup updates. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err: return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } -- cgit From 69b73e95982649a1f2dc63b8f08f2113d28f7fed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Sep 2017 10:07:44 +0200 Subject: um/time: Fixup namespace collision The new timer_setup() function for struct timer_list collides with a private um function. Rename it. Fixes: 686fef928bba ("timer: Prepare to change timer callback argument type") Signed-off-by: Thomas Gleixner Cc: Richard Weinberger Cc: Jeff Dike Cc: user-mode-linux-devel@lists.sourceforge.net Cc: Kees Cook --- arch/um/kernel/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 0b034ebbda2a..7f69d17de354 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -98,7 +98,7 @@ static struct clocksource timer_clocksource = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void __init timer_setup(void) +static void __init um_timer_setup(void) { int err; @@ -132,5 +132,5 @@ void read_persistent_clock(struct timespec *ts) void __init time_init(void) { timer_set_signal_handler(); - late_time_init = timer_setup; + late_time_init = um_timer_setup; } -- cgit From 1593baab910da72480d651ea7bf2ce6e3a25a484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:09:26 +0200 Subject: sched/debug: Implement consistent task-state printing Currently get_task_state() and task_state_to_char() report different states, create a number of common helpers and unify the reported state space. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 15 ++------------- include/linux/sched.h | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 525157ca25cb..01196d3ad452 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -130,19 +130,8 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - - /* - * Parked tasks do not run; they sit in __kthread_parkme(). - * Without this check, we would report them as running, which is - * clearly wrong, so we report them as sleeping instead. - */ - if (tsk->state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - - return task_state_array[fls(state)]; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + return task_state_array[__get_task_state(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 92fb8dd5a9e4..163a0b738908 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,17 +1243,29 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } -static inline char task_state_to_char(struct task_struct *task) +static inline unsigned int __get_task_state(struct task_struct *tsk) { - const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - unsigned long state = task->state; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; - state = state ? __ffs(state) + 1 : 0; + if (tsk_state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + return fls(state); +} + +static inline char __task_state_to_char(unsigned int state) +{ + static const char state_char[] = "RSDTtXZ"; + + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); - return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; + return state_char[state]; +} + +static inline char task_state_to_char(struct task_struct *tsk) +{ + return __task_state_to_char(__get_task_state(tsk)); } /** -- cgit From 92c4bc9f9cd92a8581e36bc5105f03b569f37e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:13:36 +0200 Subject: sched/debug: Convert TASK_state to hex Bit patterns are easier in hex. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 163a0b738908..69bed5339ffa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,23 +65,23 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING 0x0000 +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 +#define __TASK_STOPPED 0x0004 +#define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 16 -#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 0x0010 +#define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_PARKED 512 -#define TASK_NOLOAD 1024 -#define TASK_NEW 2048 -#define TASK_STATE_MAX 4096 +#define TASK_DEAD 0x0040 +#define TASK_WAKEKILL 0x0080 +#define TASK_WAKING 0x0100 +#define TASK_PARKED 0x0200 +#define TASK_NOLOAD 0x0400 +#define TASK_NEW 0x0800 +#define TASK_STATE_MAX 0x1000 #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" -- cgit From 65d5dc47fe8530e17e318722fb4df676536c2bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:14:08 +0200 Subject: sched/debug: Remove unused variable Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg) } #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { -- cgit From efb40f588b4370ffaeffafbd50f6ff213d954254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:19:53 +0200 Subject: sched/tracing: Fix trace_sched_switch task-state printing Convert trace_sched_switch to use the common task-state helpers and fix the "X" and "Z" order, possibly they ended up in the wrong order because TASK_REPORT has them in the wrong order too. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- include/trace/events/sched.h | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 69bed5339ffa..a2fe636b6825 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,7 +99,7 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ae1409ffe99a..c63e20c9ef24 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -114,7 +114,10 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ - return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; + if (preempt) + return TASK_STATE_MAX; + + return __get_task_state(p); } #endif /* CREATE_TRACE_POINTS */ @@ -152,12 +155,13 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->prev_state & (TASK_STATE_MAX-1) ? - __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", - { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, - { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "K" }, { 256, "W" }, { 512, "P" }, - { 1024, "N" }) : "R", + + (__entry->prev_state & TASK_REPORT) ? + __print_flags(__entry->prev_state & TASK_REPORT, "|", + { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + "R", + __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); -- cgit From 9c29c31830a4eca724e137a9339137204bbb31be Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Thu, 7 Sep 2017 20:00:58 +0530 Subject: locking/rwsem-xadd: Fix missed wakeup due to reordering of load If a spinner is present, there is a chance that the load of rwsem_has_spinner() in rwsem_wake() can be reordered with respect to decrement of rwsem count in __up_write() leading to wakeup being missed: spinning writer up_write caller --------------- ----------------------- [S] osq_unlock() [L] osq spin_lock(wait_lock) sem->count=0xFFFFFFFF00000001 +0xFFFFFFFF00000000 count=sem->count MB sem->count=0xFFFFFFFE00000001 -0xFFFFFFFF00000001 spin_trylock(wait_lock) return rwsem_try_write_lock(count) spin_unlock(wait_lock) schedule() Reordering of atomic_long_sub_return_release() in __up_write() and rwsem_has_spinner() in rwsem_wake() can cause missing of wakeup in up_write() context. In spinning writer, sem->count and local variable count is 0XFFFFFFFE00000001. It would result in rwsem_try_write_lock() failing to acquire rwsem and spinning writer going to sleep in rwsem_down_write_failed(). The smp_rmb() will make sure that the spinner state is consulted after sem->count is updated in up_write context. Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: longman@redhat.com Cc: parri.andrea@gmail.com Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..1fefe6dcafd7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -612,6 +612,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; DEFINE_WAKE_Q(wake_q); + /* + * __rwsem_down_write_failed_common(sem) + * rwsem_optimistic_spin(sem) + * osq_unlock(sem->osq) + * ... + * atomic_long_add_return(&sem->count) + * + * - VS - + * + * __up_write() + * if (atomic_long_sub_return_release(&sem->count) < 0) + * rwsem_wake(sem) + * osq_is_locked(&sem->osq) + * + * And __up_write() must observe !osq_is_locked() when it observes the + * atomic_long_add_return() in order to not miss a wakeup. + * + * This boils down to: + * + * [S.rel] X = 1 [RmW] r0 = (Y += 0) + * MB RMB + * [RmW] Y += 1 [L] r1 = X + * + * exists (r0=1 /\ r1=0) + */ + smp_rmb(); + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize -- cgit From 5f6ad26ea353fdf3dad2328052cbee49e0b9c5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:23:31 +0200 Subject: sched/tracing: Use common task-state helpers Remove yet another task-state char instance. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/trace/trace_output.c | 21 ++++++--------------- kernel/trace/trace_sched_wakeup.c | 8 ++++---- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a2fe636b6825..bc7807933415 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -83,8 +83,6 @@ struct task_group; #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" - /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __get_task_state(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __get_task_state(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __get_task_state(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __get_task_state(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit From 06eb61844d841d0032a9950ce7f8e783ee49c0d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:30:40 +0200 Subject: sched/debug: Add explicit TASK_IDLE printing Markus reported that kthreads that idle using TASK_IDLE instead of TASK_INTERRUPTIBLE are reported in as TASK_UNINTERRUPTIBLE and things like htop mark those red. This is undesirable, so add an explicit state for TASK_IDLE. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 21 +++++++++++++-------- include/linux/sched.h | 12 ++++++++++-- include/trace/events/sched.h | 7 ++++--- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 01196d3ad452..a120a4549d48 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -119,18 +119,23 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. */ static const char * const task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "X (dead)", /* 16 */ - "Z (zombie)", /* 32 */ + + /* states in TASK_REPORT: */ + "R (running)", /* 0x00 */ + "S (sleeping)", /* 0x01 */ + "D (disk sleep)", /* 0x02 */ + "T (stopped)", /* 0x04 */ + "t (tracing stop)", /* 0x08 */ + "X (dead)", /* 0x10 */ + "Z (zombie)", /* 0x20 */ + + /* states beyond TASK_REPORT: */ + "I (idle)", /* 0x40 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); return task_state_array[__get_task_state(tsk)]; } diff --git a/include/linux/sched.h b/include/linux/sched.h index bc7807933415..286fc1117046 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,22 +1241,30 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +#define TASK_REPORT_IDLE (TASK_REPORT + 1) +#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) + static inline unsigned int __get_task_state(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + if (tsk_state == TASK_PARKED) state = TASK_INTERRUPTIBLE; + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + return fls(state); } static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZ"; + static const char state_char[] = "RSDTtXZI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c63e20c9ef24..b371ef8206e1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -156,10 +156,11 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - (__entry->prev_state & TASK_REPORT) ? - __print_flags(__entry->prev_state & TASK_REPORT, "|", + (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? + __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, - { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, + { 0x40, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", -- cgit From 5d68cc95fb24b2f58060cc5340dd7402a11f054e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:32:41 +0200 Subject: sched/debug: Ignore TASK_IDLE for SysRq-W Markus reported that tasks in TASK_IDLE state are reported by SysRq-W, which results in undesirable clutter. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..d17c5da523a0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p) put_task_stack(p); } +static inline bool +state_filter_match(unsigned long state_filter, struct task_struct *p) +{ + /* no filter, everything matches */ + if (!state_filter) + return true; + + /* filter, but doesn't match */ + if (!(p->state & state_filter)) + return false; + + /* + * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows + * TASK_KILLABLE). + */ + if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + return false; + + return true; +} + + void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; @@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + if (state_filter_match(state_filter, p)) sched_show_task(p); } -- cgit From 8ef9925b02c23e3838d5e593c5cf37984141150f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:37:28 +0200 Subject: sched/debug: Add explicit TASK_PARKED printing Currently TASK_PARKED is masqueraded as TASK_INTERRUPTIBLE, give it its own print state because it will not in fact get woken by regular wakeups and is a long-term state. This requires moving TASK_PARKED into the TASK_REPORT mask, and since that latter needs to be a contiguous bitmask, we need to shuffle the bits around a bit. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 3 ++- include/linux/sched.h | 16 +++++++--------- include/trace/events/sched.h | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index a120a4549d48..77a8eacbe032 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -128,9 +128,10 @@ static const char * const task_state_array[] = { "t (tracing stop)", /* 0x08 */ "X (dead)", /* 0x10 */ "Z (zombie)", /* 0x20 */ + "P (parked)", /* 0x40 */ /* states beyond TASK_REPORT: */ - "I (idle)", /* 0x40 */ + "I (idle)", /* 0x80 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 286fc1117046..26a7df4e558c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -75,10 +75,10 @@ struct task_group; #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 0x0040 -#define TASK_WAKEKILL 0x0080 -#define TASK_WAKING 0x0100 -#define TASK_PARKED 0x0200 +#define TASK_PARKED 0x0040 +#define TASK_DEAD 0x0080 +#define TASK_WAKEKILL 0x0100 +#define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 @@ -97,7 +97,8 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) @@ -1251,9 +1252,6 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); - if (tsk_state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - if (tsk_state == TASK_IDLE) state = TASK_REPORT_IDLE; @@ -1262,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZI"; + static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index b371ef8206e1..3c8b7f625670 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -160,7 +160,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, - { 0x40, "I" }) : + { 0x40, "P" }, { 0x80, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", -- cgit From 520a13c530aeb5f63e011d668c42db1af19ed349 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 28 Sep 2017 16:58:26 -0500 Subject: x86/asm: Fix inline asm call constraints for GCC 4.4 The kernel test bot (run by Xiaolong Ye) reported that the following commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") is causing double faults in a kernel compiled with GCC 4.4. Linus subsequently diagnosed the crash pattern and the buggy commit and found that the issue is with this code: register unsigned int __asm_call_sp asm("esp"); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) Even on a 64-bit kernel, it's using ESP instead of RSP. That causes GCC to produce the following bogus code: ffffffff8147461d: 89 e0 mov %esp,%eax ffffffff8147461f: 4c 89 f7 mov %r14,%rdi ffffffff81474622: 4c 89 fe mov %r15,%rsi ffffffff81474625: ba 20 00 00 00 mov $0x20,%edx ffffffff8147462a: 89 c4 mov %eax,%esp ffffffff8147462c: e8 bf 52 05 00 callq ffffffff814c98f0 Despite the absurdity of it backing up and restoring the stack pointer for no reason, the bug is actually the fact that it's only backing up and restoring the lower 32 bits of the stack pointer. The upper 32 bits are getting cleared out, corrupting the stack pointer. So change the '__asm_call_sp' register variable to be associated with the actual full-size stack pointer. This also requires changing the __ASM_SEL() macro to be based on the actual compiled arch size, rather than the CONFIG value, because CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso). Otherwise Clang fails to build the kernel because it complains about the use of a 64-bit register (RSP) in a 32-bit file. Reported-and-Bisected-and-Tested-by: kernel test robot Diagnosed-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: LKP Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index c1eadbaf1115..30c3c9ac784a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -11,10 +11,12 @@ # define __ASM_FORM_COMMA(x) " " #x "," #endif -#ifdef CONFIG_X86_32 +#ifndef __x86_64__ +/* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else +/* 64 bit */ # define __ASM_SEL(a,b) __ASM_FORM(b) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif @@ -139,7 +141,7 @@ * gets set up by the containing function. If you forget to do this, objtool * may print a "call without frame pointer save/setup" warning. */ -register unsigned int __asm_call_sp asm("esp"); +register unsigned long __asm_call_sp asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) #endif -- cgit From 5ccba44ba118a5000cccc50076b0344632459779 Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Mon, 4 Sep 2017 13:59:34 +0800 Subject: sched/sysctl: Check user input value of sysctl_sched_time_avg System will hang if user set sysctl_sched_time_avg to 0: [root@XXX ~]# sysctl kernel.sched_time_avg_ms=0 Stack traceback for pid 0 0xffff883f6406c600 0 0 1 3 R 0xffff883f6406cf50 *swapper/3 ffff883f7ccc3ae8 0000000000000018 ffffffff810c4dd0 0000000000000000 0000000000017800 ffff883f7ccc3d78 0000000000000003 ffff883f7ccc3bf8 ffffffff810c4fc9 ffff883f7ccc3c08 00000000810c5043 ffff883f7ccc3c08 Call Trace: [] ? update_group_capacity+0x110/0x200 [] ? update_sd_lb_stats+0x109/0x600 [] ? find_busiest_group+0x47/0x530 [] ? load_balance+0x194/0x900 [] ? update_rq_clock.part.83+0x1a/0xe0 [] ? rebalance_domains+0x152/0x290 [] ? run_rebalance_domains+0xdc/0x1d0 [] ? __do_softirq+0xfb/0x320 [] ? irq_exit+0x125/0x130 [] ? scheduler_ipi+0x97/0x160 [] ? smp_reschedule_interrupt+0x29/0x30 [] ? reschedule_interrupt+0x6e/0x80 [] ? cpuidle_enter_state+0xcc/0x230 [] ? cpuidle_enter_state+0x9c/0x230 [] ? cpuidle_enter+0x17/0x20 [] ? cpu_startup_entry+0x38c/0x420 [] ? start_secondary+0x173/0x1e0 Because divide-by-zero error happens in function: update_group_capacity() update_cpu_capacity() scale_rt_capacity() { ... total = sched_avg_period() + delta; used = div_u64(avg, total); ... } To fix this issue, check user input value of sysctl_sched_time_avg, keep it unchanged when hitting invalid input, and set the minimum limit of sysctl_sched_time_avg to 1 ms. Reported-by: James Puthukattukaran Signed-off-by: Ethan Zhao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efault@gmx.de Cc: ethan.kernel@gmail.com Cc: keescook@chromium.org Cc: mcgrof@kernel.org Cc: Link: http://lkml.kernel.org/r/1504504774-18253-1-git-send-email-ethan.zhao@oracle.com Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..423554ad3610 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, #ifdef CONFIG_SCHEDSTATS { -- cgit From 305d0ab4764d36a02c8e7cddb67099aca65351ce Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 28 Sep 2017 18:16:44 -0700 Subject: KVM: nVMX: Fix nested #PF intends to break L1's vmlauch/vmresume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------[ cut here ]------------ WARNING: CPU: 4 PID: 5280 at /home/kernel/linux/arch/x86/kvm//vmx.c:11394 nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel] CPU: 4 PID: 5280 Comm: qemu-system-x86 Tainted: G W OE 4.13.0+ #17 RIP: 0010:nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel] Call Trace: ? emulator_read_emulated+0x15/0x20 [kvm] ? segmented_read+0xae/0xf0 [kvm] vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel] ? vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel] x86_emulate_instruction+0x733/0x810 [kvm] vmx_handle_exit+0x2f4/0xda0 [kvm_intel] ? kvm_arch_vcpu_ioctl_run+0xd2f/0x1c60 [kvm] kvm_arch_vcpu_ioctl_run+0xdab/0x1c60 [kvm] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x23/0xc2 A nested #PF is triggered during L0 emulating instruction for L2. However, it doesn't consider we should not break L1's vmlauch/vmresme. This patch fixes it by queuing the #PF exception instead ,requesting an immediate VM exit from L2 and keeping the exception for L1 pending for a subsequent nested VM exit. This should actually work all the time, making vmx_inject_page_fault_nested totally unnecessary. However, that's not working yet, so this patch can work around the issue in the meanwhile. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7f62c94196d1..5bfa353f6354 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9845,7 +9845,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) { + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { vmcs12->vm_exit_intr_error_code = fault->error_code; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | -- cgit From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 29 Sep 2017 19:01:45 +0800 Subject: kvm/x86: Handle async PF in RCU read-side critical sections Sasha Levin reported a WARNING: | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 ... | CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS | 1.10.1-1ubuntu1 04/01/2014 | Call Trace: ... | RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 | RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002 | RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000 | RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410 | RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001 | R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08 | R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040 | __schedule+0x201/0x2240 kernel/sched/core.c:3292 | schedule+0x113/0x460 kernel/sched/core.c:3421 | kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158 | do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271 | async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069 | RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996 | RSP: 0018:ffff88003b2df520 EFLAGS: 00010283 | RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670 | RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140 | RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000 | R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8 | R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000 | vsnprintf+0x173/0x1700 lib/vsprintf.c:2136 | sprintf+0xbe/0xf0 lib/vsprintf.c:2386 | proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23 | get_link fs/namei.c:1047 [inline] | link_path_walk+0x1041/0x1490 fs/namei.c:2127 ... This happened when the host hit a page fault, and delivered it as in an async page fault, while the guest was in an RCU read-side critical section. The guest then tries to reschedule in kvm_async_pf_task_wait(), but rcu_preempt_note_context_switch() would treat the reschedule as a sleep in RCU read-side critical section, which is not allowed (even in preemptible RCU). Thus the WARN. To cure this, make kvm_async_pf_task_wait() go to the halt path if the PF happens in a RCU read-side critical section. Reported-by: Sasha Levin Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Boqun Feng Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index aa60a08b65b1..e675704fa6f7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1 || + rcu_preempt_depth(); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); -- cgit From 04eae427406ef6af9b05bd631e235f4a509666b1 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 29 Sep 2017 08:25:01 -0500 Subject: RDMA/iwpm: Properly mark end of NL messages Commit 1a1c116f3dcf removes nlmsg_len calculation in ibnl_put_attr causing netlink messages to be rejected due to incorrect length. Add nlmsg_end after all attributes are appended to calculate the nlmsg_len. Fixes: 1a1c116f3dcf ("RDMA/netlink: Simplify the put_msg and put_attr") Signed-off-by: Shiraz Saleem Signed-off-by: Tatyana Nikolova Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/iwpm_msg.c | 8 ++++++++ drivers/infiniband/core/iwpm_util.c | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 30825bb9b8e9..8861c052155a 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -100,6 +100,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) if (ret) goto pid_query_error; + nlmsg_end(skb, nlh); + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); @@ -170,6 +172,8 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); if (ret) goto add_mapping_error; + + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); @@ -246,6 +250,8 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); if (ret) goto query_mapping_error; + + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); @@ -308,6 +314,8 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; + nlmsg_end(skb, nlh); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index c81c55942626..3c4faadb8cdd 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -597,6 +597,9 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; + + nlmsg_end(skb, nlh); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) { skb = NULL; @@ -678,6 +681,8 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + nlmsg_end(skb, nlh); + iwpm_print_sockaddr(&map_info->local_sockaddr, "send_mapping_info: Local sockaddr:"); iwpm_print_sockaddr(&map_info->mapped_sockaddr, -- cgit From f069faba688701c4d56b6c3452a130f97bf02e95 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 11:29:55 +0100 Subject: arm64: mm: Use READ_ONCE when dereferencing pointer to pte table On kernels built with support for transparent huge pages, different CPUs can access the PMD concurrently due to e.g. fast GUP or page_vma_mapped_walk and they must take care to use READ_ONCE to avoid value tearing or caching of stale values by the compiler. Unfortunately, these functions call into our pgtable macros, which don't use READ_ONCE, and compiler caching has been observed to cause the following crash during ext4 writeback: PC is at check_pte+0x20/0x170 LR is at page_vma_mapped_walk+0x2e0/0x540 [...] Process doio (pid: 2463, stack limit = 0xffff00000f2e8000) Call trace: [] check_pte+0x20/0x170 [] page_vma_mapped_walk+0x2e0/0x540 [] page_mkclean_one+0xac/0x278 [] rmap_walk_file+0xf0/0x238 [] rmap_walk+0x64/0xa0 [] page_mkclean+0x90/0xa8 [] clear_page_dirty_for_io+0x84/0x2a8 [] mpage_submit_page+0x34/0x98 [] mpage_process_page_bufs+0x164/0x170 [] mpage_prepare_extent_to_map+0x134/0x2b8 [] ext4_writepages+0x484/0xe30 [] do_writepages+0x44/0xe8 [] __filemap_fdatawrite_range+0xbc/0x110 [] file_write_and_wait_range+0x48/0xd8 [] ext4_sync_file+0x80/0x4b8 [] vfs_fsync_range+0x64/0xc0 [] SyS_msync+0x194/0x1e8 This is because page_vma_mapped_walk loads the PMD twice before calling pte_offset_map: the first time without READ_ONCE (where it gets all zeroes due to a concurrent pmdp_invalidate) and the second time with READ_ONCE (where it sees a valid table pointer due to a concurrent pmd_populate). However, the compiler inlines everything and caches the first value in a register, which is subsequently used in pte_offset_phys which returns a junk pointer that is later dereferenced when attempting to access the relevant pte. This patch fixes the issue by using READ_ONCE in pte_offset_phys to ensure that a stale value is not used. Whilst this is a point fix for a known failure (and simple to backport), a full fix moving all of our page table accessors over to {READ,WRITE}_ONCE and consistently using READ_ONCE in page_vma_mapped_walk is in the works for a future kernel release. Cc: Jon Masters Cc: Timur Tabi Cc: Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()") Tested-by: Richard Ruigrok Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bc4e92337d16..b46e54c2399b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -401,7 +401,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) /* Find an entry in the third-level page table. */ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -- cgit From 760bfb47c36a07741a089bf6a28e854ffbee7dc9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 12:27:41 +0100 Subject: arm64: fault: Route pte translation faults via do_translation_fault We currently route pte translation faults via do_page_fault, which elides the address check against TASK_SIZE before invoking the mm fault handling code. However, this can cause issues with the path walking code in conjunction with our word-at-a-time implementation because load_unaligned_zeropad can end up faulting in kernel space if it reads across a page boundary and runs into a page fault (e.g. by attempting to read from a guard region). In the case of such a fault, load_unaligned_zeropad has registered a fixup to shift the valid data and pad with zeroes, however the abort is reported as a level 3 translation fault and we dispatch it straight to do_page_fault, despite it being a kernel address. This results in calling a sleeping function from atomic context: BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313 in_atomic(): 0, irqs_disabled(): 0, pid: 10290 Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [...] [] ___might_sleep+0x134/0x144 [] __might_sleep+0x7c/0x8c [] do_page_fault+0x140/0x330 [] do_mem_abort+0x54/0xb0 Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0) [...] [] el1_da+0x18/0x78 [] path_parentat+0x44/0x88 [] filename_parentat+0x5c/0xd8 [] filename_create+0x4c/0x128 [] SyS_mkdirat+0x50/0xc8 [] el0_svc_naked+0x24/0x28 Code: 36380080 d5384100 f9400800 9402566d (d4210000) ---[ end trace 2d01889f2bca9b9f ]--- Fix this by dispatching all translation faults to do_translation_faults, which avoids invoking the page fault logic for faults on kernel addresses. Cc: Reported-by: Ankit Jain Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 89993c4be1be..2069e9bc0fca 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -651,7 +651,7 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, - { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, { do_bad, SIGBUS, 0, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, -- cgit From bc829ee36e0ec92383c9d9b88fe08f00d4d592f8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 29 Sep 2017 11:24:19 -0500 Subject: x86/mm: Disable branch profiling in mem_encrypt.c Some routines in mem_encrypt.c are called very early in the boot process, e.g. sme_encrypt_kernel(). When CONFIG_TRACE_BRANCH_PROFILING=y is defined the resulting branch profiling associated with the check to see if SME is active results in a kernel crash. Disable branch profiling for mem_encrypt.c by defining DISABLE_BRANCH_PROFILING before including any header files. Reported-by: kernel test robot Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929162419.6016.53390.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3fcc8e01683b..16c5f37933a2 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -10,6 +10,8 @@ * published by the Free Software Foundation. */ +#define DISABLE_BRANCH_PROFILING + #include #include #include -- cgit From 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 29 Sep 2017 17:15:36 +0300 Subject: x86/asm: Use register variable to get stack pointer value Currently we use current_stack_pointer() function to get the value of the stack pointer register. Since commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") ... we have a stack register variable declared. It can be used instead of current_stack_pointer() function which allows to optimize away some excessive "mov %rsp, %" instructions: -mov %rsp,%rdx -sub %rdx,%rax -cmp $0x3fff,%rax -ja ffffffff810722fd +sub %rsp,%rax +cmp $0x3fff,%rax +ja ffffffff810722fa Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer and use it instead of the removed function. Signed-off-by: Andrey Ryabinin Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 4 ++-- arch/x86/include/asm/thread_info.h | 11 ----------- arch/x86/kernel/irq_32.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- arch/x86/mm/tlb.c | 2 +- 5 files changed, 7 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 30c3c9ac784a..b0dc91f4bedc 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -141,8 +141,8 @@ * gets set up by the containing function. If you forget to do this, objtool * may print a "call without frame pointer save/setup" warning. */ -register unsigned long __asm_call_sp asm(_ASM_SP); -#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 5161da1a0fa0..89e7eeb5cec1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -158,17 +158,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - /* * Walks up the stack frames to make sure that the specified object is * entirely contained by a single stack frame. diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1f38d9a4d9de..d4eb450144fd 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -64,7 +64,7 @@ static void call_on_stack(void *func, void *stack) static inline void *current_stack(void) { - return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) @@ -88,7 +88,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -139,7 +139,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 34ea3651362e..67db4f43309e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -142,7 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * from double_fault. */ BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer()) >= THREAD_SIZE); + current_stack_pointer) >= THREAD_SIZE); preempt_enable_no_resched(); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93fe97cce581..49d9778376d7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -191,7 +191,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int index = pgd_index(current_stack_pointer()); + unsigned int index = pgd_index(current_stack_pointer); pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) -- cgit From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Sep 2017 13:43:15 -0400 Subject: fix infoleak in waitid(2) kernel_waitid() can return a PID, an error or 0. rusage is filled in the first case and waitid(2) rusage should've been copied out exactly in that case, *not* whenever kernel_waitid() has not returned an error. Compat variant shares that braino; none of kernel_wait4() callers do, so the below ought to fix it. Reported-and-tested-by: Alexander Potapenko Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland") Cc: stable@vger.kernel.org # v4.13 Signed-off-by: Al Viro --- kernel/exit.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 3481ababd06a..f2cd53e92147 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; + if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } @@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err && uru) { - /* kernel_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - err = copy_to_user(uru, &ru, sizeof(ru)); - else - err = put_compat_rusage(&ru, uru); - if (err) - return -EFAULT; + if (uru) { + /* kernel_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + err = copy_to_user(uru, &ru, sizeof(ru)); + else + err = put_compat_rusage(&ru, uru); + if (err) + return -EFAULT; + } } if (!infop) -- cgit From 9792bf5ad5e30b207274ccbb459a89eab6033b46 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 22 Sep 2017 22:00:29 -0700 Subject: clk: Export clk_bulk_prepare() Allow clk_bulk_prepare() to be referenced by kernel modules by adding the missing EXPORT_SYMBOL_GPL(). Fixes: 266e4e9d9150 ("clk: add clk_bulk_get accessories") Reported-by: Ulf Hansson Signed-off-by: Bjorn Andersson Reviewed-by: Ulf Hansson Signed-off-by: Stephen Boyd --- drivers/clk/clk-bulk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c index c834f5abfc49..4c10456f8a32 100644 --- a/drivers/clk/clk-bulk.c +++ b/drivers/clk/clk-bulk.c @@ -105,6 +105,7 @@ err: return ret; } +EXPORT_SYMBOL_GPL(clk_bulk_prepare); #endif /* CONFIG_HAVE_CLK_PREPARE */ -- cgit From fef0035c0f31322d417d1954bba5ab959bf91183 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Sep 2017 00:41:44 +0200 Subject: netlink: do not proceed if dump's start() errs Drivers that use the start method for netlink dumping rely on dumpit not being called if start fails. For example, ila_xlat.c allocates memory and assigns it to cb->args[0] in its start() function. It might fail to do that and return -ENOMEM instead. However, even when returning an error, dumpit will be called, which, in the example above, quickly dereferences the memory in cb->args[0], which will OOPS the kernel. This is but one example of how this goes wrong. Since start() has always been a function with an int return type, it therefore makes sense to use it properly, rather than ignoring it. This patch thus returns early and does not call dumpit() when start() fails. Signed-off-by: Jason A. Donenfeld Cc: Johannes Berg Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 327807731b44..94c11cf0459d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2270,10 +2270,13 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + ret = 0; if (cb->start) - cb->start(cb); + ret = cb->start(cb); + + if (!ret) + ret = netlink_dump(sk); - ret = netlink_dump(sk); sock_put(sk); if (ret) -- cgit From d51711c0557d6dbd26c63144aef32c7b3ec264b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:31 +0800 Subject: ip_gre: ipgre_tap device should keep dst Without keeping dst, the tunnel will not update any mtu/pmtu info, since it does not have a dst on the skb. Reproducer: client(ipgre_tap1 - eth1) <-----> (eth1 - ipgre_tap1)server After reducing eth1's mtu on client, then perforamnce became 0. This patch is to netif_keep_dst in gre_tap_init, as ipgre does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..8b837f6f5532 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1223,6 +1223,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } -- cgit From 2d40557cc702ed8e5edd9bd422233f86652d932e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:50 +0800 Subject: ip6_gre: ip6gre_tap device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed a issue that ipgre_tap mtu couldn't be updated in tx path. The same fix is needed for ip6gre_tap as well. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 20f66f4c9460..1602b491b281 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1311,6 +1311,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], -- cgit From d41bb33ba33b8f8debe54ed36be6925eb496e354 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:24:07 +0800 Subject: ip6_tunnel: update mtu properly for ARPHRD_ETHER tunnel device in tx path Now when updating mtu in tx path, it doesn't consider ARPHRD_ETHER tunnel device, like ip6gre_tap tunnel, for which it should also subtract ether header to get the correct mtu. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f2f21c24915f..a1c24443cd9e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1043,6 +1043,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1125,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1134,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; -- cgit From 7487449c86c65202b3b725c4524cb48dd65e4e6f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:36 +0200 Subject: IPv4: early demux can return an error code Currently no error is emitted, but this infrastructure will used by the next patch to allow source address validation for mcast sockets. Since early demux can do a route lookup and an ipv4 route lookup can return an error code this is consistent with the current ipv4 route infrastructure. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/protocol.h | 4 ++-- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/ipv4/ip_input.c | 25 +++++++++++++++---------- net/ipv4/tcp_ipv4.c | 9 +++++---- net/ipv4/udp.c | 11 ++++++----- 6 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/net/protocol.h b/include/net/protocol.h index 65ba335b0e7e..4fc75f7ae23b 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -39,8 +39,8 @@ /* This is used to register protocols. */ struct net_protocol { - void (*early_demux)(struct sk_buff *skb); - void (*early_demux_handler)(struct sk_buff *skb); + int (*early_demux)(struct sk_buff *skb); + int (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 3bc910a9bfc6..89974c5286d8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); -void tcp_v4_early_demux(struct sk_buff *skb); +int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); diff --git a/include/net/udp.h b/include/net/udp.h index 12dfbfe2e2d7..6c759c8594e2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err); } -void udp_v4_early_demux(struct sk_buff *skb); +int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..85164d4d3e53 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,23 +1503,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..9b30f821fe96 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2221,7 +2221,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; @@ -2234,7 +2234,7 @@ void udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2244,14 +2244,14 @@ void udp_v4_early_demux(struct sk_buff *skb) struct in_device *in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; /* we are supposed to accept bcast packets */ if (skb->pkt_type == PACKET_MULTICAST) { ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return; + return 0; } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -2263,7 +2263,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2278,6 +2278,7 @@ void udp_v4_early_demux(struct sk_buff *skb) */ skb_dst_set_noref(skb, dst); } + return 0; } int udp_rcv(struct sk_buff *skb) -- cgit From bc044e8db7962e727a75b591b9851ff2ac5cf846 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:37 +0200 Subject: udp: perform source validation for mcast early demux The UDP early demux can leverate the rx dst cache even for multicast unconnected sockets. In such scenario the ipv4 source address is validated only on the first packet in the given flow. After that, when we fetch the dst entry from the socket rx cache, we stop enforcing the rp_filter and we even start accepting any kind of martian addresses. Disabling the dst cache for unconnected multicast socket will cause large performace regression, nearly reducing by half the max ingress tput. Instead we factor out a route helper to completely validate an skb source address for multicast packets and we call it from the UDP early demux for mcast packets landing on unconnected sockets, after successful fetching the related cached dst entry. This still gives a measurable, but limited performance regression: rp_filter = 0 rp_filter = 1 edmux disabled: 1182 Kpps 1127 Kpps edmux before: 2238 Kpps 2238 Kpps edmux after: 2037 Kpps 2019 Kpps The above figures are on top of current net tree. Applying the net-next commit 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") the delta with rp_filter == 0 will decrease even more. Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/route.h | 4 +++- net/ipv4/route.c | 46 ++++++++++++++++++++++++++-------------------- net/ipv4/udp.c | 13 ++++++++++++- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 57dfc6850d37..d538e6db1afe 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -175,7 +175,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } - +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin); int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..ac6fde5d45f1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. */ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9b30f821fe96..5676237d2b0f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2224,6 +2224,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2241,7 +2242,7 @@ int udp_v4_early_demux(struct sk_buff *skb) if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; @@ -2272,11 +2273,21 @@ int udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } return 0; } -- cgit From 5a59a3a0ef0e546626a762d49dc06feaa204bab3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 28 Sep 2017 17:57:58 +0200 Subject: ppp: fix __percpu annotation Move sparse annotation right after pointer type. Fixes sparse warning: drivers/net/ppp/ppp_generic.c:1422:13: warning: incorrect type in initializer (different address spaces) drivers/net/ppp/ppp_generic.c:1422:13: expected void const [noderef] *__vpp_verify drivers/net/ppp/ppp_generic.c:1422:13: got int * ... Fixes: e5dadc65f9e0 ("ppp: Fix false xmit recursion detect with two ppp devices") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a404552555d4..c3f77e3b7819 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -120,7 +120,7 @@ struct ppp { int n_channels; /* how many channels are attached 54 */ spinlock_t rlock; /* lock for receive side 58 */ spinlock_t wlock; /* lock for transmit side 5c */ - int *xmit_recursion __percpu; /* xmit recursion detect */ + int __percpu *xmit_recursion; /* xmit recursion detect */ int mru; /* max receive unit 60 */ unsigned int flags; /* control bits 64 */ unsigned int xstate; /* transmit state bits 68 */ -- cgit From aad06212d36cf34859428a0a279e5c14ee5c9e26 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Fri, 29 Sep 2017 10:02:54 +0200 Subject: tipc: use only positive error codes in messages In commit e3a77561e7d32 ("tipc: split up function tipc_msg_eval()"), we have updated the function tipc_msg_lookup_dest() to set the error codes to negative values at destination lookup failures. Thus when the function sets the error code to -TIPC_ERR_NO_NAME, its inserted into the 4 bit error field of the message header as 0xf instead of TIPC_ERR_NO_NAME (1). The value 0xf is an unknown error code. In this commit, we set only positive error code. Fixes: e3a77561e7d32 ("tipc: split up function tipc_msg_eval()") Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/msg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 6ef379f004ac..121e59a1d0e7 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -551,7 +551,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); -- cgit From 007a61ae2f35c7fcf767313285c4924e81f11983 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Thu, 28 Sep 2017 21:33:23 +0200 Subject: nvme: fix visibility of "uuid" ns attribute "uuid" must be invisible if both ns->uuid and ns->nguid are unset, not if either one is. Fixes: d934f9848a77 "nvme: provide UUID value to userspace" Signed-off-by: Martin Wilck Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb2aad078637..5a14cc7f28ee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2136,7 +2136,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ns->uuid) || + if (uuid_is_null(&ns->uuid) && !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) return 0; } -- cgit From 74007ae6316ebe40260e44f8ab558f9b1ccc04e5 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Sat, 23 Sep 2017 08:44:15 +0200 Subject: hwmon: (xgene) Fix up error handling path mixup in 'xgene_hwmon_probe()' Commit 2ca492e22cb7 has moved the call to 'kfifo_alloc()' from after the main 'if' statement to before it. But it has not updated the error handling paths accordingly. Fix all that: - if 'kfifo_alloc()' fails we can return directly - direct returns after 'kfifo_alloc()' must now go to 'out_mbox_free' - 'goto out_mbox_free' must be replaced by 'goto out', otherwise the '[pcc_]mbox_free_channel()' call will be missed. Fixes: 2ca492e22cb7 ("hwmon: (xgene) Fix crash when alarm occurs before driver probe") Signed-off-by: Christophe JAILLET Signed-off-by: Guenter Roeck --- drivers/hwmon/xgene-hwmon.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index 9c0dbb8191ad..e1be61095532 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -630,7 +630,7 @@ static int xgene_hwmon_probe(struct platform_device *pdev) sizeof(struct slimpro_resp_msg) * ASYNC_MSG_FIFO_SIZE, GFP_KERNEL); if (rc) - goto out_mbox_free; + return -ENOMEM; INIT_WORK(&ctx->workq, xgene_hwmon_evt_work); @@ -646,7 +646,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (IS_ERR(ctx->mbox_chan)) { dev_err(&pdev->dev, "SLIMpro mailbox channel request failed\n"); - return -ENODEV; + rc = -ENODEV; + goto out_mbox_free; } } else { struct acpi_pcct_hw_reduced *cppc_ss; @@ -654,7 +655,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (device_property_read_u32(&pdev->dev, "pcc-channel", &ctx->mbox_idx)) { dev_err(&pdev->dev, "no pcc-channel property\n"); - return -ENODEV; + rc = -ENODEV; + goto out_mbox_free; } cl->rx_callback = xgene_hwmon_pcc_rx_cb; @@ -662,7 +664,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (IS_ERR(ctx->mbox_chan)) { dev_err(&pdev->dev, "PPC channel request failed\n"); - return -ENODEV; + rc = -ENODEV; + goto out_mbox_free; } /* @@ -675,13 +678,13 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (!cppc_ss) { dev_err(&pdev->dev, "PPC subspace not found\n"); rc = -ENODEV; - goto out_mbox_free; + goto out; } if (!ctx->mbox_chan->mbox->txdone_irq) { dev_err(&pdev->dev, "PCC IRQ not supported\n"); rc = -ENODEV; - goto out_mbox_free; + goto out; } /* @@ -696,14 +699,14 @@ static int xgene_hwmon_probe(struct platform_device *pdev) } else { dev_err(&pdev->dev, "Failed to get PCC comm region\n"); rc = -ENODEV; - goto out_mbox_free; + goto out; } if (!ctx->pcc_comm_addr) { dev_err(&pdev->dev, "Failed to ioremap PCC comm region\n"); rc = -ENOMEM; - goto out_mbox_free; + goto out; } /* -- cgit From 9e66317d3c92ddaab330c125dfe9d06eee268aff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Oct 2017 14:54:54 -0700 Subject: Linux 4.14-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f0c5b21fadb2..cf007a31d575 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Fearless Coyote # *DOCUMENTATION* -- cgit From 935a9749a36828af0e8be224a5cd4bc758112c34 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:53 +0800 Subject: ip_gre: get key from session_id correctly in erspan_rcv erspan only uses the first 10 bits of session_id as the key to look up the tunnel. But in erspan_rcv, it missed 'session_id & ID_MASK' when getting the key from session_id. If any other flag is also set in session_id in a packet, it would fail to find the tunnel due to incorrect key in erspan_rcv. This patch is to add 'session_id & ID_MASK' there and also remove the unnecessary variable session_id. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8b837f6f5532..b25b1e5112d0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, -- cgit From 5513d08d29511c263c00933c00dd7a82fffda3c9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:54 +0800 Subject: ip_gre: check packet length and mtu correctly in erspan_xmit As a ARPHRD_ETHER device, skb->len in erspan_xmit is the length of the whole ether packet. So before checking if a packet size exceeds the mtu, skb->len should subtract dev->hard_header_len. Otherwise, all packets with max size according to mtu would be trimmed to be truncated packet. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b25b1e5112d0..2a4ef9dc48ff 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -731,7 +731,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { + if (skb->len - dev->hard_header_len > dev->mtu) { pskb_trim(skb, dev->mtu); truncate = true; } -- cgit From c122fda271717f4fc618e0a31e833941fd5f1efd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:55 +0800 Subject: ip_gre: set tunnel hlen properly in erspan_tunnel_init According to __gre_tunnel_init, tunnel->hlen should be set as the headers' length between inner packet and outer iphdr. It would be used especially to calculate a proper mtu when updating mtu in tnl_update_pmtu. Now without setting it, a bigger mtu value than expected would be updated, which hurts performance a lot. This patch is to fix it by setting tunnel->hlen with: tunnel->tun_hlen + tunnel->encap_hlen + sizeof(struct erspanhdr) Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2a4ef9dc48ff..fad0bb1e3e9a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1245,7 +1245,9 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; -- cgit From c84bed440e4e11a973e8c0254d0dfaccfca41fb0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:56 +0800 Subject: ip_gre: erspan device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed the issue ipgre_tap dev mtu couldn't be updated in tx path. The same fix is needed for erspan as well. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fad0bb1e3e9a..467e44d7587d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1254,6 +1254,7 @@ static int erspan_tunnel_init(struct net_device *dev) dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } -- cgit From 9f775ead5e570e7e19015b9e4e2f3dd6e71a5935 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 28 Sep 2017 15:44:38 +0200 Subject: l2tp: fix l2tp_eth module loading The l2tp_eth module crashes if its netlink callbacks are run when the pernet data aren't initialised. We should normally register_pernet_device() before the genl callbacks. However, the pernet data only maintain a list of l2tpeth interfaces, and this list is never used. So let's just drop pernet handling instead. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 51 ++------------------------------------------------- 1 file changed, 2 insertions(+), 49 deletions(-) diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 87da9ef61860..014a7bc2a872 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -44,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -58,17 +57,6 @@ struct l2tp_eth_sess { struct net_device *dev; }; -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; -}; - -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static int l2tp_eth_dev_init(struct net_device *dev) { @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); - - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); dev_put(dev); } @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; if (cfg->ifname) { strlcpy(name, cfg->ifname, IFNAMSIZ); @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, strlcpy(session->ifname, dev->name, IFNAMSIZ); dev_hold(dev); - pn = l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); return 0; @@ -342,22 +318,6 @@ out: return rc; } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); - - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); - - return 0; -} - -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } -- cgit From 90841047a01b452cc8c3f9b990698b264143334a Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Thu, 28 Sep 2017 11:35:00 -0700 Subject: r8152: add Linksys USB3GIGV1 id This linksys dongle by default comes up in cdc_ether mode. This patch allows r8152 to claim the device: Bus 002 Device 002: ID 13b1:0041 Linksys Signed-off-by: Grant Grundler Reviewed-by: Douglas Anderson Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 10 ++++++++++ drivers/net/usb/r8152.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 8ab281b478f2..677a85360db1 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -547,6 +547,7 @@ static const struct driver_info wwan_info = { #define REALTEK_VENDOR_ID 0x0bda #define SAMSUNG_VENDOR_ID 0x04e8 #define LENOVO_VENDOR_ID 0x17ef +#define LINKSYS_VENDOR_ID 0x13b1 #define NVIDIA_VENDOR_ID 0x0955 #define HP_VENDOR_ID 0x03f0 #define MICROSOFT_VENDOR_ID 0x045e @@ -737,6 +738,15 @@ static const struct usb_device_id products[] = { .driver_info = 0, }, +#if IS_ENABLED(CONFIG_USB_RTL8152) +/* Linksys USB3GIGV1 Ethernet Adapter */ +{ + USB_DEVICE_AND_INTERFACE_INFO(LINKSYS_VENDOR_ID, 0x0041, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), + .driver_info = 0, +}, +#endif + /* ThinkPad USB-C Dock (based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3062, USB_CLASS_COMM, diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index ceb78e2ea4f0..941ece08ba78 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -613,6 +613,7 @@ enum rtl8152_flags { #define VENDOR_ID_MICROSOFT 0x045e #define VENDOR_ID_SAMSUNG 0x04e8 #define VENDOR_ID_LENOVO 0x17ef +#define VENDOR_ID_LINKSYS 0x13b1 #define VENDOR_ID_NVIDIA 0x0955 #define MCU_TYPE_PLA 0x0100 @@ -5316,6 +5317,7 @@ static const struct usb_device_id rtl8152_table[] = { {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x7205)}, {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x720c)}, {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x7214)}, + {REALTEK_USB_DEVICE(VENDOR_ID_LINKSYS, 0x0041)}, {REALTEK_USB_DEVICE(VENDOR_ID_NVIDIA, 0x09ff)}, {} }; -- cgit From 4792ea04bcd03b8ccfd1ae336c5deba52dd9edc9 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Fri, 29 Sep 2017 14:27:39 +0200 Subject: net: mvpp2: Fix clock resource by adding an optional bus clock On Armada 7K/8K we need to explicitly enable the bus clock. The bus clock is optional because not all the SoCs need them but at least for Armada 7K/8K it is actually mandatory. The binding documentation is updating accordingly. Signed-off-by: Gregory CLEMENT Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/marvell-pp2.txt | 10 ++++++---- drivers/net/ethernet/marvell/mvpp2.c | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt index 7e2dad08a12e..1814fa13f6ab 100644 --- a/Documentation/devicetree/bindings/net/marvell-pp2.txt +++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt @@ -21,8 +21,9 @@ Required properties: - main controller clock (for both armada-375-pp2 and armada-7k-pp2) - GOP clock (for both armada-375-pp2 and armada-7k-pp2) - MG clock (only for armada-7k-pp2) -- clock-names: names of used clocks, must be "pp_clk", "gop_clk" and - "mg_clk" (the latter only for armada-7k-pp2). + - AXI clock (only for armada-7k-pp2) +- clock-names: names of used clocks, must be "pp_clk", "gop_clk", "mg_clk" + and "axi_clk" (the 2 latter only for armada-7k-pp2). The ethernet ports are represented by subnodes. At least one port is required. @@ -78,8 +79,9 @@ Example for marvell,armada-7k-pp2: cpm_ethernet: ethernet@0 { compatible = "marvell,armada-7k-pp22"; reg = <0x0 0x100000>, <0x129000 0xb000>; - clocks = <&cpm_syscon0 1 3>, <&cpm_syscon0 1 9>, <&cpm_syscon0 1 5>; - clock-names = "pp_clk", "gop_clk", "gp_clk"; + clocks = <&cpm_syscon0 1 3>, <&cpm_syscon0 1 9>, + <&cpm_syscon0 1 5>, <&cpm_syscon0 1 18>; + clock-names = "pp_clk", "gop_clk", "gp_clk", "axi_clk"; eth0: eth0 { interrupts = , diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index 161055564720..9c86cb7cb988 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -793,6 +793,7 @@ struct mvpp2 { struct clk *pp_clk; struct clk *gop_clk; struct clk *mg_clk; + struct clk *axi_clk; /* List of pointers to port structures */ struct mvpp2_port **port_list; @@ -7970,6 +7971,18 @@ static int mvpp2_probe(struct platform_device *pdev) err = clk_prepare_enable(priv->mg_clk); if (err < 0) goto err_gop_clk; + + priv->axi_clk = devm_clk_get(&pdev->dev, "axi_clk"); + if (IS_ERR(priv->axi_clk)) { + err = PTR_ERR(priv->axi_clk); + if (err == -EPROBE_DEFER) + goto err_gop_clk; + priv->axi_clk = NULL; + } else { + err = clk_prepare_enable(priv->axi_clk); + if (err < 0) + goto err_gop_clk; + } } /* Get system's tclk rate */ @@ -8024,6 +8037,7 @@ static int mvpp2_probe(struct platform_device *pdev) return 0; err_mg_clk: + clk_disable_unprepare(priv->axi_clk); if (priv->hw_version == MVPP22) clk_disable_unprepare(priv->mg_clk); err_gop_clk: @@ -8061,6 +8075,7 @@ static int mvpp2_remove(struct platform_device *pdev) aggr_txq->descs_dma); } + clk_disable_unprepare(priv->axi_clk); clk_disable_unprepare(priv->mg_clk); clk_disable_unprepare(priv->pp_clk); clk_disable_unprepare(priv->gop_clk); -- cgit From 81359617f1b783a01e6e22b46cbb046e9513b9c6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 30 Sep 2017 07:34:34 +0200 Subject: net: hns3: Fix an error handling path in 'hclge_rss_init_hw()' If this sanity check fails, we must free 'rss_indir'. Otherwise there is a memory leak. 'goto err' as done in the other error handling paths to fix it. Fixes: 46a3df9f9718 ("net: hns3: Fix for setting rss_size incorrectly") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e0685e630afe..c1cdbfd83bdb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2652,7 +2652,8 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) dev_err(&hdev->pdev->dev, "Configure rss tc size failed, invalid TC_SIZE = %d\n", rss_size); - return -EINVAL; + ret = -EINVAL; + goto err; } roundup_size = roundup_pow_of_two(rss_size); -- cgit From fb458864d9a78cc433fec7979acbe4078c82d7a8 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Tue, 26 Sep 2017 09:03:40 +0900 Subject: mmc: core: add driver strength selection when selecting hs400es The driver strength selection is missed and required when selecting hs400es. So, It is added here. Fixes: 81ac2af65793ecf ("mmc: core: implement enhanced strobe support") Cc: stable@vger.kernel.org Signed-off-by: Hankyung Yu Signed-off-by: Chanho Min Reviewed-by: Adrian Hunter Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index a7eb623f8daa..36217ad5e9b1 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1286,6 +1286,23 @@ out_err: return err; } +static void mmc_select_driver_type(struct mmc_card *card) +{ + int card_drv_type, drive_strength, drv_type; + + card_drv_type = card->ext_csd.raw_driver_strength | + mmc_driver_type_mask(0); + + drive_strength = mmc_select_drive_strength(card, + card->ext_csd.hs200_max_dtr, + card_drv_type, &drv_type); + + card->drive_strength = drive_strength; + + if (drv_type) + mmc_set_driver_type(card->host, drv_type); +} + static int mmc_select_hs400es(struct mmc_card *card) { struct mmc_host *host = card->host; @@ -1341,6 +1358,8 @@ static int mmc_select_hs400es(struct mmc_card *card) goto out_err; } + mmc_select_driver_type(card); + /* Switch card to HS400 */ val = EXT_CSD_TIMING_HS400 | card->drive_strength << EXT_CSD_DRV_STR_SHIFT; @@ -1374,23 +1393,6 @@ out_err: return err; } -static void mmc_select_driver_type(struct mmc_card *card) -{ - int card_drv_type, drive_strength, drv_type; - - card_drv_type = card->ext_csd.raw_driver_strength | - mmc_driver_type_mask(0); - - drive_strength = mmc_select_drive_strength(card, - card->ext_csd.hs200_max_dtr, - card_drv_type, &drv_type); - - card->drive_strength = drive_strength; - - if (drv_type) - mmc_set_driver_type(card->host, drv_type); -} - /* * For device supporting HS200 mode, the following sequence * should be done before executing the tuning process. -- cgit From 2a5e597c6bb1b873e473e5f57147e9e5d2755430 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Mon, 18 Sep 2017 09:27:42 -0700 Subject: HID: wacom: Always increment hdev refcount within wacom_get_hdev_data The wacom_get_hdev_data function is used to find and return a reference to the "other half" of a Wacom device (i.e., the touch device associated with a pen, or vice-versa). To ensure these references are properly accounted for, the function is supposed to automatically increment the refcount before returning. This was not done, however, for devices which have pen & touch on different interfaces of the same USB device. This can lead to a WARNING ("refcount_t: underflow; use-after-free") when removing the module or device as we call kref_put() more times than kref_get(). Triggering an "actual" use- after-free would be difficult since both devices will disappear nearly- simultaneously. To silence this warning and prevent the potential error, we need to increment the refcount for all cases within wacom_get_hdev_data. Fixes: 41372d5d40 ("HID: wacom: Augment 'oVid' and 'oPid' with heuristics for HID_GENERIC") Cc: # v4.9+ Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 735bfbbcaa82..906e654fb0ba 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -668,8 +668,10 @@ static struct wacom_hdev_data *wacom_get_hdev_data(struct hid_device *hdev) /* Try to find an already-probed interface from the same device */ list_for_each_entry(data, &wacom_udev_list, list) { - if (compare_device_paths(hdev, data->dev, '/')) + if (compare_device_paths(hdev, data->dev, '/')) { + kref_get(&data->kref); return data; + } } /* Fallback to finding devices that appear to be "siblings" */ -- cgit From 814b6d17487fd970f293ee674c90ba267f82415d Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 19 Sep 2017 18:37:46 -0700 Subject: HID: hidraw: fix power sequence when closing device We should not try to bring HID device out of full power state before calling hid_hw_close(), so that transport driver operates on powered up device (making this inverse of the opening sequence). Signed-off-by: Dmitry Torokhov Reviewed-by: Guenter Roeck Reviewed-by: Benson Leung Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index ec530454e6f6..5fbe0f81ab2e 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -337,8 +337,8 @@ static void drop_ref(struct hidraw *hidraw, int exists_bit) kfree(hidraw); } else { /* close device for last reader */ - hid_hw_power(hidraw->hid, PM_HINT_NORMAL); hid_hw_close(hidraw->hid); + hid_hw_power(hidraw->hid, PM_HINT_NORMAL); } } } -- cgit From 66dcdafe8e251a3edc5d84cf725835567bd3dd35 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 2 Oct 2017 15:50:34 +0800 Subject: Revert "HID: multitouch: Support ALPS PTP stick with pid 0x120A" This reverts commit fcaa4a07d2a4b541e91da7a55d8b3331f96d1865. As noted by Masaki [1], 0x120A + trackpoint will not be used in mass production machines, so remove the ID accordingly. [1] http://www.spinics.net/lists/linux-input/msg53222.html Signed-off-by: Kai-Heng Feng Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 - drivers/hid/hid-multitouch.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 832897d4a849..a98919199858 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -75,7 +75,6 @@ #define USB_VENDOR_ID_ALPS_JP 0x044E #define HID_DEVICE_ID_ALPS_U1_DUAL 0x120B -#define HID_DEVICE_ID_ALPS_U1_PTP_2 0x120A #define HID_DEVICE_ID_ALPS_U1_DUAL_PTP 0x121F #define HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP 0x1220 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index c78625dceced..9e8c4d2ba11d 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1419,10 +1419,6 @@ static const struct hid_device_id mt_devices[] = { HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, USB_VENDOR_ID_ALPS_JP, HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP) }, - { .driver_data = MT_CLS_WIN_8_DUAL, - HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, - USB_VENDOR_ID_ALPS_JP, - HID_DEVICE_ID_ALPS_U1_PTP_2) }, /* Lenovo X1 TAB Gen 2 */ { .driver_data = MT_CLS_WIN_8_DUAL, -- cgit From 51db452df07bb4c5754b73789253ba21681d9dc2 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 26 Sep 2017 09:11:49 +0900 Subject: Revert "ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members" This reverts commit 275353bb684e to fix a regression which can abort 'alsactl' program in alsa-utils due to assertion in alsa-lib. alsactl: control.c:2513: snd_ctl_elem_value_get_integer: Assertion `idx < sizeof(obj->value.integer.value) / sizeof(obj->value.integer.value[0])' failed. alsactl: control.c:2976: snd_ctl_elem_value_get_integer: Assertion `idx < ARRAY_SIZE(obj->value.integer.value)' failed. This commit is a band-aid. In a point of usage of ALSA control interface, the drivers still bring an issue that they prevent userspace applications to have a consistent way to parse each levels of the dimension information via ALSA control interface. Let me investigate this issue. Current implementation of the drivers have three control element sets with dimension information: * 'Monitor Mixer Volume' (type: integer) * 'VMixer Volume' (type: integer) * 'VU-meters' (type: boolean) Although the number of elements named as 'Monitor Mixer Volume' differs depending on drivers in this group, it can be calculated by macros defined by each driver (= (BX_NUM - BX_ANALOG_IN) * BX_ANALOG_IN). Each of the elements has one member for value and has dimension information with 2 levels (= BX_ANALOG_IN * (BX_NUM - BX_ANALOG_IN)). For these elements, userspace applications are expected to handle the dimension information so that all of the elements construct a matrix where the number of rows and columns are represented by the dimension information. The same way is applied to elements named as 'VMixer Volume'. The number of these elements can also be calculated by macros defined by each drivers (= PX_ANALOG_IN * BX_ANALOG_IN). Each of the element has one member for value and has dimension information with 2 levels (= BX_ANALOG_IN * PX_ANALOG_IN). All of the elements construct a matrix with the dimension information. An element named as 'VU-meters' gets a different way in a point of dimension information. The element includes 96 members for value. The element has dimension information with 3 levels (= 3 or 2 * 16 * 2). For this element, userspace applications are expected to handle the dimension information so that all of the members for value construct a matrix where the number of rows and columns are represented by the dimension information. This is different from the way for the former. As a summary, the drivers were not designed to produce a consistent way to parse the dimension information. This makes it hard for general userspace applications such as amixer to parse the information by a consistent way, and actually no userspace applications except for 'echomixer' utilize the dimension information. Additionally, no drivers excluding this group use the information. The reverted commit was written based on the latter way. A commit 860c1994a70a ('ALSA: control: add dimension validator for userspace elements') is written based on the latter way, too. The patch should be reconsider too in the same time to re-define a consistent way to parse the dimension information. Reported-by: Mark Hills Reported-by: S. Christian Collins Fixes: 275353bb684e ('ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members') Cc: # v4.8+ Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai --- sound/pci/echoaudio/echoaudio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/pci/echoaudio/echoaudio.c b/sound/pci/echoaudio/echoaudio.c index 7326695bca33..d68f99e076a8 100644 --- a/sound/pci/echoaudio/echoaudio.c +++ b/sound/pci/echoaudio/echoaudio.c @@ -1272,11 +1272,11 @@ static int snd_echo_mixer_info(struct snd_kcontrol *kcontrol, chip = snd_kcontrol_chip(kcontrol); uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = ECHOGAIN_MAXOUT; uinfo->dimen.d[0] = num_busses_out(chip); uinfo->dimen.d[1] = num_busses_in(chip); - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1]; return 0; } @@ -1344,11 +1344,11 @@ static int snd_echo_vmixer_info(struct snd_kcontrol *kcontrol, chip = snd_kcontrol_chip(kcontrol); uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = ECHOGAIN_MAXOUT; uinfo->dimen.d[0] = num_busses_out(chip); uinfo->dimen.d[1] = num_pipes_out(chip); - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1]; return 0; } @@ -1728,6 +1728,7 @@ static int snd_echo_vumeters_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 96; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = 0; #ifdef ECHOCARD_HAS_VMIXER @@ -1737,7 +1738,6 @@ static int snd_echo_vumeters_info(struct snd_kcontrol *kcontrol, #endif uinfo->dimen.d[1] = 16; /* 16 channels */ uinfo->dimen.d[2] = 2; /* 0=level, 1=peak */ - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1] * uinfo->dimen.d[2]; return 0; } -- cgit From 0a6de8b8668a2cfc0912a1d7df21107e1a075a3a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 2 Oct 2017 12:42:00 +0100 Subject: arm64: fix misleading data abort decoding Currently data_abort_decode() dumps the ISS field as a decimal value with a '0x' prefix, which is somewhat misleading. Fix it to print as hexadecimal, as was intended. Fixes: 1f9b8936f36f4a8e ("arm64: Decode information from ESR upon mem faults") Reviewed-by: Dave Martin Reviewed-by: Julien Thierry Acked-by: Will Deacon Signed-off-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2069e9bc0fca..b64958b23a7f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -97,7 +97,7 @@ static void data_abort_decode(unsigned int esr) (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT, (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT); } else { - pr_alert(" ISV = 0, ISS = 0x%08lu\n", esr & ESR_ELx_ISS_MASK); + pr_alert(" ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK); } pr_alert(" CM = %lu, WnR = %lu\n", -- cgit From 9f4057fc937f200f000dbc378c5c3e37d45e31dc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Sep 2017 09:26:57 +0800 Subject: ceph: properly queue cap snap for newly created snap realm commit 3ae0bebc "ceph: queue cap snap only when snap realm's context changes" introduced a regression: we may not call queue_realm_cap_snaps() for newly created snap realm. This regression allows unflushed snapshot data to be overwritten. Link: http://tracker.ceph.com/issues/21483 Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 1ffc8b426c1c..7fc0b850c352 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -374,12 +374,10 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) { - ceph_put_snap_context(realm->cached_context); - /* queue realm for cap_snap creation */ - list_add_tail(&realm->dirty_item, dirty_realms); - } + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); return 0; fail: -- cgit From 38f340ccdf9ed5f1350505b46c5689d015967057 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Sep 2017 11:41:06 +0800 Subject: ceph: fix __choose_mds() for LSSNAP request previous commit 5d37ca14 "ceph: send LSSNAP request to auth mds of directory inode" is buggy. It makes __choose_mds() choose mds base on hash of '.snap' dentry. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 84edfc60d87a..f23c820daaed 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -734,12 +734,13 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = req->r_inode; ihold(inode); } else { - /* req->r_dentry is non-null for LSSNAP request. - * fall-thru */ - WARN_ON_ONCE(!req->r_dentry); + /* req->r_dentry is non-null for LSSNAP request */ + rcu_read_lock(); + inode = get_nonsnap_parent(req->r_dentry); + rcu_read_unlock(); + dout("__choose_mds using snapdir's parent %p\n", inode); } - } - if (!inode && req->r_dentry) { + } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; -- cgit From 7682e399485fe19622b6fd82510b1f4551e48a25 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 2 Oct 2017 14:06:43 +0200 Subject: ALSA: usx2y: Suppress kernel warning at page allocation failures The usx2y driver allocates the stream read/write buffers in continuous pages depending on the stream setup, and this may spew the kernel warning messages with a stack trace like: WARNING: CPU: 1 PID: 1846 at mm/page_alloc.c:3883 __alloc_pages_slowpath+0x1ef2/0x2d70 Modules linked in: CPU: 1 PID: 1846 Comm: kworker/1:2 Not tainted .... It may confuse user as if it were any serious error, although this is no fatal error and the driver handles the error case gracefully. Since the driver has already some sanity check of the given size (128 and 256 pages), it can't pass any crazy value. So it's merely page fragmentation. This patch adds __GFP_NOWARN to each caller for suppressing such kernel warnings. The original issue was spotted by syzkaller. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/usx2y/usb_stream.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/usb/usx2y/usb_stream.c b/sound/usb/usx2y/usb_stream.c index 4dab49080700..e229abd21652 100644 --- a/sound/usb/usx2y/usb_stream.c +++ b/sound/usb/usx2y/usb_stream.c @@ -191,7 +191,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, } pg = get_order(read_size); - sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->s) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); goto out; @@ -211,7 +212,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, pg = get_order(write_size); sk->write_page = - (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->write_page) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); usb_stream_free(sk); -- cgit From 28a04c7b7bbecaab642fcb6a2d7354eb70ea7fbe Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 2 Oct 2017 12:14:56 +0200 Subject: mlxsw: spectrum_router: Move VRF refcounting When creating a new RIF, bumping RIF count of the containing VR is the last thing to be done. Symmetrically, when destroying a RIF, RIF count is first dropped and only then the rest of the cleanup proceeds. That's a problem for loopback RIFs. Those hold two VR references: one for overlay and one for underlay. mlxsw_sp_rif_destroy() releases the overlay one, and the deconfigure() callback the underlay one. But if both overlay and underlay are the same, and if there are no other artifacts holding the VR alive, this put actually destroys the VR. Later on, when mlxsw_sp_rif_destroy() calls mlxsw_sp_vr_put() for the same VR, the VR will already have been released and the kernel crashes with NULL pointer dereference. The underlying problem is that the RIF under destruction ends up referencing the overlay VR much longer than it claims: all the way until the call to mlxsw_sp_vr_put(). So line up the reference counting properly to reflect this. Make corresponding changes in mlxsw_sp_rif_create() as well for symmetry. Fixes: 6ddb7426a7d4 ("mlxsw: spectrum_router: Introduce loopback RIFs") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 2cfb3f5d092d..3917b4dd4202 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5068,6 +5068,7 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? : RT_TABLE_MAIN); if (IS_ERR(vr)) return ERR_CAST(vr); + vr->rif_count++; err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index); if (err) @@ -5099,7 +5100,6 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_rif_counters_alloc(rif); mlxsw_sp->router->rifs[rif_index] = rif; - vr->rif_count++; return rif; @@ -5110,6 +5110,7 @@ err_fid_get: kfree(rif); err_rif_alloc: err_rif_index_alloc: + vr->rif_count--; mlxsw_sp_vr_put(vr); return ERR_PTR(err); } @@ -5124,7 +5125,6 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif) mlxsw_sp_router_rif_gone_sync(mlxsw_sp, rif); vr = &mlxsw_sp->router->vrs[rif->vr_id]; - vr->rif_count--; mlxsw_sp->router->rifs[rif->rif_index] = NULL; mlxsw_sp_rif_counters_free(rif); ops->deconfigure(rif); @@ -5132,6 +5132,7 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif) /* Loopback RIFs are not associated with a FID. */ mlxsw_sp_fid_put(fid); kfree(rif); + vr->rif_count--; mlxsw_sp_vr_put(vr); } -- cgit From de0f43c01a4b5d408a5c087c8a92ac1739938f8b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 2 Oct 2017 12:14:57 +0200 Subject: mlxsw: spectrum_router: Track RIF of IPIP next hops When considering whether to set RTNH_F_OFFLOAD flag on an IPv6 route, mlxsw_sp_fib6_entry_offload_set() looks up the mlxsw_sp_nexthop corresponding to a given route, and decides based on whether the next hop's offloaded flag was set. When looking for the matching next hop, it also takes into account the device of the route, which must match next hop's RIF. IPIP next hops however hitherto didn't set the RIF. As a result, IPv6 routes forwarding traffic to IP-in-IP netdevices are never marked as offloaded, even when they actually are. Thus track RIF of IPIP next hops the same way as that of ETHERNET next hops. Fixes: 8f28a3097645 ("mlxsw: spectrum_router: Support IPv6 overlay encap") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 3917b4dd4202..032089efc1a0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2723,6 +2723,7 @@ static void mlxsw_sp_nexthop_type_fini(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_nexthop_rif_fini(nh); break; case MLXSW_SP_NEXTHOP_TYPE_IPIP: + mlxsw_sp_nexthop_rif_fini(nh); mlxsw_sp_nexthop_ipip_fini(mlxsw_sp, nh); break; } @@ -2742,7 +2743,11 @@ static int mlxsw_sp_nexthop4_type_init(struct mlxsw_sp *mlxsw_sp, router->ipip_ops_arr[ipipt]->can_offload(mlxsw_sp, dev, MLXSW_SP_L3_PROTO_IPV4)) { nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; - return mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + err = mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + if (err) + return err; + mlxsw_sp_nexthop_rif_init(nh, &nh->ipip_entry->ol_lb->common); + return 0; } nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; @@ -4009,7 +4014,11 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, router->ipip_ops_arr[ipipt]->can_offload(mlxsw_sp, dev, MLXSW_SP_L3_PROTO_IPV6)) { nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; - return mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + err = mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + if (err) + return err; + mlxsw_sp_nexthop_rif_init(nh, &nh->ipip_entry->ol_lb->common); + return 0; } nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; -- cgit From d312fefea8387503375f728855c9a62de20c9665 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 2 Oct 2017 19:31:24 +0100 Subject: ahci: don't ignore result code of ahci_reset_controller() ahci_pci_reset_controller() calls ahci_reset_controller(), which may fail, but ignores the result code and always returns success. This may result in failures like below ahci 0000:02:00.0: version 3.0 ahci 0000:02:00.0: enabling device (0000 -> 0003) ahci 0000:02:00.0: SSS flag set, parallel bus scan disabled ahci 0000:02:00.0: controller reset failed (0xffffffff) ahci 0000:02:00.0: failed to stop engine (-5) ... repeated many times ... ahci 0000:02:00.0: failed to stop engine (-5) Unable to handle kernel paging request at virtual address ffff0000093f9018 ... PC is at ahci_stop_engine+0x5c/0xd8 [libahci] LR is at ahci_deinit_port.constprop.12+0x1c/0xc0 [libahci] ... [] ahci_stop_engine+0x5c/0xd8 [libahci] [] ahci_deinit_port.constprop.12+0x1c/0xc0 [libahci] [] ahci_init_controller+0x80/0x168 [libahci] [] ahci_pci_init_controller+0x60/0x68 [ahci] [] ahci_init_one+0x75c/0xd88 [ahci] [] local_pci_probe+0x3c/0xb8 [] pci_device_probe+0x138/0x170 [] driver_probe_device+0x2dc/0x458 [] __driver_attach+0x114/0x118 [] bus_for_each_dev+0x60/0xa0 [] driver_attach+0x20/0x28 [] bus_add_driver+0x1f0/0x2a8 [] driver_register+0x60/0xf8 [] __pci_register_driver+0x3c/0x48 [] ahci_pci_driver_init+0x1c/0x1000 [ahci] [] do_one_initcall+0x38/0x120 where an obvious hardware level failure results in an unnecessary 15 second delay and a subsequent crash. So record the result code of ahci_reset_controller() and relay it, rather than ignoring it. Signed-off-by: Ard Biesheuvel Signed-off-by: Tejun Heo --- drivers/ata/ahci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index cb9b0e9090e3..9f78bb03bb76 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -621,8 +621,11 @@ static void ahci_pci_save_initial_config(struct pci_dev *pdev, static int ahci_pci_reset_controller(struct ata_host *host) { struct pci_dev *pdev = to_pci_dev(host->dev); + int rc; - ahci_reset_controller(host); + rc = ahci_reset_controller(host); + if (rc) + return rc; if (pdev->vendor == PCI_VENDOR_ID_INTEL) { struct ahci_host_priv *hpriv = host->private_data; -- cgit From cb1dab0e01969d63717c7464cb5d75c77a39bf02 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 29 Sep 2017 16:22:57 +0800 Subject: drm/sun4i: hdmi: Disable clks in bind function error path and unbind function The HDMI driver enables the bus and mod clocks in the bind function, but does not disable them if it then bails our due to any errors. Neither does it disable the clocks in the unbind function. Fix this by adding a proper error path to the bind function, and clk_disable_unprepare calls to the unbind function. Also rename the err_cleanup_connector label to err_cleanup_encoder, since it is the encoder that gets cleaned up. Fixes: 9c5681011a0c ("drm/sun4i: Add HDMI support") Signed-off-by: Chen-Yu Tsai Acked-by: Maxime Ripard Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20170929082306.16193-6-wens@csie.org --- drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c index 9ea6cd5a1370..3cf1a6932fac 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c @@ -302,26 +302,29 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master, hdmi->mod_clk = devm_clk_get(dev, "mod"); if (IS_ERR(hdmi->mod_clk)) { dev_err(dev, "Couldn't get the HDMI mod clock\n"); - return PTR_ERR(hdmi->mod_clk); + ret = PTR_ERR(hdmi->mod_clk); + goto err_disable_bus_clk; } clk_prepare_enable(hdmi->mod_clk); hdmi->pll0_clk = devm_clk_get(dev, "pll-0"); if (IS_ERR(hdmi->pll0_clk)) { dev_err(dev, "Couldn't get the HDMI PLL 0 clock\n"); - return PTR_ERR(hdmi->pll0_clk); + ret = PTR_ERR(hdmi->pll0_clk); + goto err_disable_mod_clk; } hdmi->pll1_clk = devm_clk_get(dev, "pll-1"); if (IS_ERR(hdmi->pll1_clk)) { dev_err(dev, "Couldn't get the HDMI PLL 1 clock\n"); - return PTR_ERR(hdmi->pll1_clk); + ret = PTR_ERR(hdmi->pll1_clk); + goto err_disable_mod_clk; } ret = sun4i_tmds_create(hdmi); if (ret) { dev_err(dev, "Couldn't create the TMDS clock\n"); - return ret; + goto err_disable_mod_clk; } writel(SUN4I_HDMI_CTRL_ENABLE, hdmi->base + SUN4I_HDMI_CTRL_REG); @@ -362,7 +365,7 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master, ret = sun4i_hdmi_i2c_create(dev, hdmi); if (ret) { dev_err(dev, "Couldn't create the HDMI I2C adapter\n"); - return ret; + goto err_disable_mod_clk; } drm_encoder_helper_add(&hdmi->encoder, @@ -422,6 +425,10 @@ err_cleanup_connector: drm_encoder_cleanup(&hdmi->encoder); err_del_i2c_adapter: i2c_del_adapter(hdmi->i2c); +err_disable_mod_clk: + clk_disable_unprepare(hdmi->mod_clk); +err_disable_bus_clk: + clk_disable_unprepare(hdmi->bus_clk); return ret; } @@ -434,6 +441,8 @@ static void sun4i_hdmi_unbind(struct device *dev, struct device *master, drm_connector_cleanup(&hdmi->connector); drm_encoder_cleanup(&hdmi->encoder); i2c_del_adapter(hdmi->i2c); + clk_disable_unprepare(hdmi->mod_clk); + clk_disable_unprepare(hdmi->bus_clk); } static const struct component_ops sun4i_hdmi_ops = { -- cgit From 6e60a3bbb45bd8b307269d6a821ee2c72d815846 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 2 Oct 2017 16:22:08 -0400 Subject: nbd: fix -ERESTARTSYS handling Christoph made it so that if we return'ed BLK_STS_RESOURCE whenever we got ERESTARTSYS from sending our packets we'd return BLK_STS_OK, which means we'd never requeue and just hang. We really need to return the right value from the upper layer. Fixes: fc17b6534eb8 ("blk-mq: switch ->queue_rq return value to blk_status_t") Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3684e21d543f..883dfebd3014 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -820,9 +820,13 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * appropriate. */ ret = nbd_handle_cmd(cmd, hctx->queue_num); + if (ret < 0) + ret = BLK_STS_IOERR; + else if (!ret) + ret = BLK_STS_OK; complete(&cmd->send_complete); - return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK; + return ret; } static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, -- cgit From eefca20eb20c66b06cf5ed09b49b1a7caaa27b7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Oct 2017 12:20:51 -0700 Subject: socket, bpf: fix possible use after free Starting from linux-4.4, 3WHS no longer takes the listener lock. Since this time, we might hit a use-after-free in sk_filter_charge(), if the filter we got in the memcpy() of the listener content just happened to be replaced by a thread changing listener BPF filter. To fix this, we need to make sure the filter refcount is not already zero before incrementing it again. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 12 ++++++++---- net/core/sock.c | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..74b8c91fb5f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -989,10 +989,14 @@ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - bool ret = __sk_filter_charge(sk, fp); - if (ret) - refcount_inc(&fp->refcnt); - return ret; + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) diff --git a/net/core/sock.c b/net/core/sock.c index 7d55c05f449d..23953b741a41 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1684,13 +1684,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new -- cgit From 28a0bc4120d38a394499382ba21d6965a67a3703 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 27 Sep 2017 21:35:12 -0400 Subject: scsi: sd: Implement blacklist option for WRITE SAME w/ UNMAP SBC-4 states: "A MAXIMUM UNMAP LBA COUNT field set to a non-zero value indicates the maximum number of LBAs that may be unmapped by an UNMAP command" "A MAXIMUM WRITE SAME LENGTH field set to a non-zero value indicates the maximum number of contiguous logical blocks that the device server allows to be unmapped or written in a single WRITE SAME command." Despite the spec being clear on the topic, some devices incorrectly expect WRITE SAME commands with the UNMAP bit set to be limited to the value reported in MAXIMUM UNMAP LBA COUNT in the Block Limits VPD. Implement a blacklist option that can be used to accommodate devices with this behavior. Cc: Reported-by: Bill Kuzeja Reported-by: Ewan D. Milne Reviewed-by: Ewan D. Milne Tested-by: Laurence Oberman Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_scan.c | 3 +++ drivers/scsi/sd.c | 16 ++++++++++++---- include/scsi/scsi_device.h | 1 + include/scsi/scsi_devinfo.h | 1 + 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index e7818afeda2b..15590a063ad9 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -956,6 +956,9 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, if (*bflags & BLIST_NO_DIF) sdev->no_dif = 1; + if (*bflags & BLIST_UNMAP_LIMIT_WS) + sdev->unmap_limit_for_ws = 1; + sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT; if (*bflags & BLIST_TRY_VPD_PAGES) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index fb9f8b5f4673..3d26a729825c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -715,13 +715,21 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) break; case SD_LBP_WS16: - max_blocks = min_not_zero(sdkp->max_ws_blocks, - (u32)SD_MAX_WS16_BLOCKS); + if (sdkp->device->unmap_limit_for_ws) + max_blocks = sdkp->max_unmap_blocks; + else + max_blocks = sdkp->max_ws_blocks; + + max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: - max_blocks = min_not_zero(sdkp->max_ws_blocks, - (u32)SD_MAX_WS10_BLOCKS); + if (sdkp->device->unmap_limit_for_ws) + max_blocks = sdkp->max_unmap_blocks; + else + max_blocks = sdkp->max_ws_blocks; + + max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 82e93ee94708..67c5a9f223f7 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -192,6 +192,7 @@ struct scsi_device { unsigned no_dif:1; /* T10 PI (DIF) should be disabled */ unsigned broken_fua:1; /* Don't set FUA bit */ unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */ + unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 9592570e092a..36b03013d629 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -29,5 +29,6 @@ #define BLIST_TRY_VPD_PAGES 0x10000000 /* Attempt to read VPD pages */ #define BLIST_NO_RSOC 0x20000000 /* don't try to issue RSOC */ #define BLIST_MAX_1024 0x40000000 /* maximum 1024 sector cdb length */ +#define BLIST_UNMAP_LIMIT_WS 0x80000000 /* Use UNMAP limit for WRITE SAME */ #endif -- cgit From 77082ca503bed061f7fbda7cfd7c93beda967a41 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 27 Sep 2017 21:38:59 -0400 Subject: scsi: sd: Do not override max_sectors_kb sysfs setting A user may lower the max_sectors_kb setting in sysfs to accommodate certain workloads. Previously we would always set the max I/O size to either the block layer default or the optional preferred I/O size reported by the device. Keep the current heuristics for the initial setting of max_sectors_kb. For subsequent invocations, only update the current queue limit if it exceeds the capabilities of the hardware. Cc: Reported-by: Don Brace Reviewed-by: Martin Wilck Tested-by: Don Brace Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3d26a729825c..d175c5c5ccf8 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3107,8 +3107,6 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_security(sdkp, buffer); } - sdkp->first_scan = 0; - /* * We now have all cache related info, determine how we deal * with flush requests. @@ -3123,7 +3121,7 @@ static int sd_revalidate_disk(struct gendisk *disk) q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); /* - * Use the device's preferred I/O size for reads and writes + * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, or * garbage. */ @@ -3137,8 +3135,19 @@ static int sd_revalidate_disk(struct gendisk *disk) rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), (sector_t)BLK_DEF_MAX_SECTORS); - /* Combine with controller limits */ - q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q)); + /* Do not exceed controller limit */ + rw_max = min(rw_max, queue_max_hw_sectors(q)); + + /* + * Only update max_sectors if previously unset or if the current value + * exceeds the capabilities of the hardware. + */ + if (sdkp->first_scan || + q->limits.max_sectors > q->limits.max_dev_sectors || + q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors = rw_max; + + sdkp->first_scan = 0; set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); -- cgit From 9e10b5121ad991ea6e84ca40b15a04cdc551bfe9 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Thu, 13 Jul 2017 09:11:21 -0700 Subject: scsi: libiscsi: Fix use-after-free race during iscsi_session_teardown Session attributes exposed through sysfs were freed before the device was destroyed, resulting in a potential use-after-free. Free these attributes after removing the device. Signed-off-by: Khazhismel Kumykov Acked-by: Chris Leech Signed-off-by: Martin K. Petersen --- drivers/scsi/libiscsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index bd4605a34f54..c62e8d111fd9 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup); /** * iscsi_session_teardown - destroy session, host, and cls_session * @cls_session: iscsi session - * - * The driver must have called iscsi_remove_session before - * calling this. */ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) { @@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) iscsi_pool_free(&session->cmdpool); + iscsi_remove_session(cls_session); + kfree(session->password); kfree(session->password_in); kfree(session->username); @@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->portal_type); kfree(session->discovery_parent_type); - iscsi_destroy_session(cls_session); + iscsi_free_session(cls_session); + iscsi_host_dec_session_cnt(shost); module_put(owner); } -- cgit From 1c048a250aae1aaab0ba9dbec908f0c6cdb8614f Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Thu, 13 Jul 2017 09:11:22 -0700 Subject: scsi: libiscsi: Remove iscsi_destroy_session iscsi_session_teardown was the only user of this function. Function currently is just short for iscsi_remove_session + iscsi_free_session. Signed-off-by: Khazhismel Kumykov Acked-by: Chris Leech Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_iscsi.c | 16 ---------------- include/scsi/scsi_transport_iscsi.h | 1 - 2 files changed, 17 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 0190aeff5f7f..7404d26895f5 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_free_session); -/** - * iscsi_destroy_session - destroy iscsi session - * @session: iscsi_session - * - * Can be called by a LLD or iscsi_transport. There must not be - * any running connections. - */ -int iscsi_destroy_session(struct iscsi_cls_session *session) -{ - iscsi_remove_session(session); - ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n"); - iscsi_free_session(session); - return 0; -} -EXPORT_SYMBOL_GPL(iscsi_destroy_session); - /** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 6183d20a01fb..b266d2a3bcb1 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); -extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, int dd_size, uint32_t cid); extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn); -- cgit From 88e65389fce1f68ba6d13ae2fc0f8d7e5c338c52 Mon Sep 17 00:00:00 2001 From: "Bryant G. Ly" Date: Mon, 2 Oct 2017 12:59:38 -0500 Subject: scsi: ibmvscsis: Fix write_pending failure path For write_pending if the queue is down or client failed then return -EIO so that LIO can properly process the completed command. Prior we returned 0 since LIO could not handle it properly. Now with commit fa7e25cf13a6 ("target: Fix unknown fabric callback queue-full errors") that patch addresses LIO's ability to handle things right. Signed-off-by: Bryant G. Ly Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index 785fb42f6650..2799a6b08f73 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -3767,7 +3767,7 @@ static int ibmvscsis_write_pending(struct se_cmd *se_cmd) */ if ((vscsi->flags & (CLIENT_FAILED | RESPONSE_Q_DOWN))) { pr_err("write_pending failed since: %d\n", vscsi->flags); - return 0; + return -EIO; } rc = srp_transfer_data(cmd, &vio_iu(iue)->srp.cmd, ibmvscsis_rdma, -- cgit From 3b7af5c0fd9631762d1c4d7b4cee76f571dd3c2c Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 27 Sep 2017 12:55:51 +0800 Subject: powerpc: Fix action argument for cpufeatures-based TLB flush Commit 41d0c2ecde19 ("powerpc/powernv: Fix local TLB flush for boot and MCE on POWER9") introduced calls to __flush_tlb_power[89] from the cpufeatures code, specifying the number of sets to flush. However, these functions take an action argument, not a number of sets. This means we hit the BUG() in __flush_tlb_{206,300} when using cpufeatures-style configuration. This change passes TLB_INVAL_SCOPE_GLOBAL instead. Fixes: 41d0c2ecde19 ("powerpc/powernv: Fix local TLB flush for boot and MCE on POWER9") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Jeremy Kerr Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/dt_cpu_ftrs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1df770e8cbe0..7275fed271af 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -102,10 +102,10 @@ static void cpufeatures_flush_tlb(void) case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: - __flush_tlb_power8(POWER8_TLB_SETS); + __flush_tlb_power8(TLB_INVAL_SCOPE_GLOBAL); break; case PVR_POWER9: - __flush_tlb_power9(POWER9_TLB_SETS_HASH); + __flush_tlb_power9(TLB_INVAL_SCOPE_GLOBAL); break; default: pr_err("unknown CPU version for boot TLB flush\n"); -- cgit From 070e004912fed099263408bf2ff1bbc6926abe2e Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Sun, 1 Oct 2017 16:33:03 +0200 Subject: powerpc/4xx: Fix compile error with 64K pages on 40x, 44x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mmu context on the 40x, 44x does not define pte_frag entry. This causes gcc abort the compilation due to: setup-common.c: In function ‘setup_arch’: setup-common.c:908: error: ‘mm_context_t’ has no ‘pte_frag’ This patch fixes the issue by removing the pte_frag initialization in setup-common.c. This is possible, because the compiler will do the initialization, since the mm_context is a sub struct of init_mm. init_mm is declared in mm_types.h as external linkage. According to C99 6.2.4.3: An object whose identifier is declared with external linkage [...] has static storage duration. C99 defines in 6.7.8.10 that: If an object that has static storage duration is not initialized explicitly, then: - if it has pointer type, it is initialized to a null pointer Fixes: b1923caa6e64 ("powerpc: Merge 32-bit and 64-bit setup_arch()") Signed-off-by: Christian Lamparter Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 0ac741fae90e..2e3bc16d02b2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -904,9 +904,6 @@ void __init setup_arch(char **cmdline_p) #endif #endif -#ifdef CONFIG_PPC_64K_PAGES - init_mm.context.pte_frag = NULL; -#endif #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif -- cgit From 2b0b8499ae75df91455bbeb7491d45affc384fb0 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 12 Sep 2017 10:14:54 +0800 Subject: ftrace: Fix kmemleak in unregister_ftrace_graph The trampoline allocated by function tracer was overwriten by function_graph tracer, and caused a memory leak. The save_global_trampoline should have saved the previous trampoline in register_ftrace_graph() and restored it in unregister_ftrace_graph(). But as it is implemented, save_global_trampoline was only used in unregister_ftrace_graph as default value 0, and it overwrote the previous trampoline's value. Causing the previous allocated trampoline to be lost. kmmeleak backtrace: kmemleak_vmalloc+0x77/0xc0 __vmalloc_node_range+0x1b5/0x2c0 module_alloc+0x7c/0xd0 arch_ftrace_update_trampoline+0xb5/0x290 ftrace_startup+0x78/0x210 register_ftrace_function+0x8b/0xd0 function_trace_init+0x4f/0x80 tracing_set_tracer+0xe6/0x170 tracing_set_trace_write+0x90/0xd0 __vfs_write+0x37/0x170 vfs_write+0xb2/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x67/0x180 return_from_SYSCALL_64+0x0/0x6a [ Looking further into this, I found that this was left over from when the function and function graph tracers shared the same ftrace_ops. But in commit 5f151b2401 ("ftrace: Fix function_profiler and function tracer together"), the two were separated, and the save_global_trampoline no longer was necessary (and it may have been broken back then too). -- Steven Rostedt ] Link: http://lkml.kernel.org/r/20170912021454.5976-1-shuwang@redhat.com Cc: stable@vger.kernel.org Fixes: 5f151b2401 ("ftrace: Fix function_profiler and function tracer together") Signed-off-by: Shu Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } -- cgit From f39b536ce9248e9799ff900358d6f073ab2e6c55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Sep 2017 20:19:02 -0700 Subject: rcu: Remove extraneous READ_ONCE()s from rcu_irq_{enter,exit}() The read of ->dynticks_nmi_nesting in rcu_irq_enter() and rcu_irq_exit() is currently protected with READ_ONCE(). However, this protection is unnecessary because (1) ->dynticks_nmi_nesting is updated only by the current CPU, (2) Although NMI handlers can update this field, they reset it back to its old value before return, and (3) Interrupts are disabled, so nothing else can modify it. The value of ->dynticks_nmi_nesting is thus effectively constant, and so no protection is required. This commit therefore removes the READ_ONCE() protection from these two accesses. Link: http://lkml.kernel.org/r/20170926031902.GA2074@linux.vnet.ibm.com Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 63bee8e1b193..c03152f7e458 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -890,7 +890,7 @@ void rcu_irq_exit(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -1027,7 +1027,7 @@ void rcu_irq_enter(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; oldval = rdtp->dynticks_nesting; -- cgit From 2fb1e946450a4fef74bb72f360555f7760d816f0 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Tue, 26 Sep 2017 16:47:04 +1000 Subject: KVM: PPC: Book3S: Fix server always zero from kvmppc_xive_get_xive() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In KVM's XICS-on-XIVE emulation, kvmppc_xive_get_xive() returns the value of state->guest_server as "server". However, this value is not set by it's counterpart kvmppc_xive_set_xive(). When the guest uses this interface to migrate interrupts away from a CPU that is going offline, it sees all interrupts as belonging to CPU 0, so they are left assigned to (now) offline CPUs. This patch removes the guest_server field from the state, and returns act_server in it's place (that is, the CPU actually handling the interrupt, which may differ from the one requested). Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org Signed-off-by: Sam Bobroff Acked-by: Benjamin Herrenschmidt Signed-off-by: Radim Krčmář --- arch/powerpc/kvm/book3s_xive.c | 5 ++--- arch/powerpc/kvm/book3s_xive.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 13304622ab1c..bf457843e032 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -622,7 +622,7 @@ int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, return -EINVAL; state = &sb->irq_state[idx]; arch_spin_lock(&sb->lock); - *server = state->guest_server; + *server = state->act_server; *priority = state->guest_priority; arch_spin_unlock(&sb->lock); @@ -1331,7 +1331,7 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) xive->saved_src_count++; /* Convert saved state into something compatible with xics */ - val = state->guest_server; + val = state->act_server; prio = state->saved_scan_prio; if (prio == MASKED) { @@ -1507,7 +1507,6 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) /* First convert prio and mark interrupt as untargetted */ act_prio = xive_prio_from_guest(guest_prio); state->act_priority = MASKED; - state->guest_server = server; /* * We need to drop the lock due to the mutex below. Hopefully diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 5938f7644dc1..6ba63f8e8a61 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -35,7 +35,6 @@ struct kvmppc_xive_irq_state { struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */ /* Targetting as set by guest */ - u32 guest_server; /* Current guest selected target */ u8 guest_priority; /* Guest set priority */ u8 saved_priority; /* Saved priority when masking */ -- cgit From ce024f42c2e28b6bce4ecc1e891b42f57f753892 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 3 Oct 2017 13:20:48 +0300 Subject: net: rtnetlink: fix info leak in RTM_GETSTATS call When RTM_GETSTATS was added the fields of its header struct were not all initialized when returning the result thus leaking 4 bytes of information to user-space per rtnl_fill_statsinfo call, so initialize them now. Thanks to Alexander Potapenko for the detailed report and bisection. Reported-by: Alexander Potapenko Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump link stats") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..d4bcdcc68e92 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3854,6 +3854,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; -- cgit From 38b249bc0ca26d57dac65f5f659b39d88899d23d Mon Sep 17 00:00:00 2001 From: Wouter Verhelst Date: Fri, 22 Sep 2017 12:09:54 +0200 Subject: MAINTAINERS: update list for NBD nbd-general@sourceforge.net becomes nbd@other.debian.org, because sourceforge is just a spamtrap these days. Signed-off-by: Wouter Verhelst Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6671f375f7fc..17a643f670a4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9348,7 +9348,7 @@ NETWORK BLOCK DEVICE (NBD) M: Josef Bacik S: Maintained L: linux-block@vger.kernel.org -L: nbd-general@lists.sourceforge.net +L: nbd@other.debian.org F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/uapi/linux/nbd.h -- cgit From d410a6417e5e4deb4a9c34164addcebebc9099c9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 3 Oct 2017 13:18:47 +0200 Subject: ide: free hwif->portdev on hwif_init() failure Recent pci_assign_irq() changes uncovered a problem with missing freeing of ide_port class instance on hwif_init() failure in ide_host_register(): ide0: disabled, no IRQ ide0: failed to initialize IDE interface ide0: disabling port cmd64x 0000:00:02.0: IDE controller (0x1095:0x0646 rev 0x07) CMD64x_IDE 0000:00:02.0: BAR 0: can't reserve [io 0x8050-0x8057] cmd64x 0000:00:02.0: can't reserve resources CMD64x_IDE: probe of 0000:00:02.0 failed with error -16 ide_generic: please use "probe_mask=0x3f" module parameter for probing all legacy ISA IDE ports ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x94/0xd0 sysfs: cannot create duplicate filename '/class/ide_port/ide0' ... Trace: [] __warn+0x160/0x190 [] sysfs_warn_dup+0x94/0xd0 [] warn_slowpath_fmt+0x58/0x70 [] sysfs_warn_dup+0x94/0xd0 [] kernfs_path_from_node+0x30/0x60 [] kernfs_put+0x16c/0x2c0 [] kernfs_put+0x16c/0x2c0 [] sysfs_do_create_link_sd.isra.2+0x100/0x120 [] device_add+0x2a4/0x7c0 [] device_create_groups_vargs+0x14c/0x170 [] device_create_groups_vargs+0x98/0x170 [] device_create+0x50/0x70 [] ide_host_register+0x48c/0xa00 [] ide_host_register+0x450/0xa00 [] device_register+0x20/0x50 [] ide_host_register+0x450/0xa00 [] ide_host_add+0x64/0xe0 [] kobject_uevent_env+0x16c/0x710 [] do_one_initcall+0x68/0x260 [] kernel_init+0x1c/0x1a0 [] kernel_init+0x0/0x1a0 [] ret_from_kernel_thread+0x18/0x20 [] kernel_init+0x0/0x1a0 ---[ end trace 24a70433c3e4d374 ]--- ide0: disabling port Fix the problem by adding missing code to ide_host_register(). Fixes: 30fdfb929e82 ("PCI: Add a call to pci_assign_irq() in pci_device_probe()") Fixes: 0e4c2eeb758a ("alpha/PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") Link: http://lkml.kernel.org/r/32ec730f-c1b0-5584-cd35-f8a809122b96@roeck-us.net Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Bartlomiej Zolnierkiewicz [bhelgaas: add Fixes:] Signed-off-by: Bjorn Helgaas Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner --- drivers/ide/ide-probe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 01b2adfd8226..eaf39e5db08b 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1451,6 +1451,7 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, if (hwif_init(hwif) == 0) { printk(KERN_INFO "%s: failed to initialize IDE " "interface\n", hwif->name); + device_unregister(hwif->portdev); device_unregister(&hwif->gendev); ide_disable_port(hwif); continue; -- cgit From a06876a766c27dcedf17a0f60be89609ee23d861 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 3 Oct 2017 14:17:13 +0200 Subject: ide: pci: free PCI BARs on initialization failure Recent pci_assign_irq() changes uncovered a problem with missing freeing of PCI BARs on PCI IDE host initialization failure: ide0: disabled, no IRQ ide0: failed to initialize IDE interface ide0: disabling port cmd64x 0000:00:02.0: IDE controller (0x1095:0x0646 rev 0x07) CMD64x_IDE 0000:00:02.0: BAR 0: can't reserve [io 0x8050-0x8057] cmd64x 0000:00:02.0: can't reserve resources CMD64x_IDE: probe of 0000:00:02.0 failed with error -16 Fix the problem by adding missing freeing of PCI BARs to ide_setup_pci_controller() and ide_pci_init_two(). Fixes: 30fdfb929e82 ("PCI: Add a call to pci_assign_irq() in pci_device_probe()") Fixes: 0e4c2eeb758a ("alpha/PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") Link: http://lkml.kernel.org/r/32ec730f-c1b0-5584-cd35-f8a809122b96@roeck-us.net Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Bartlomiej Zolnierkiewicz [bhelgaas: add Fixes:] Signed-off-by: Bjorn Helgaas Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner --- drivers/ide/setup-pci.c | 63 +++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 112d2fe1bcdb..fdc8e813170c 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -179,6 +179,7 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise); /** * ide_pci_enable - do PCI enables * @dev: PCI device + * @bars: PCI BARs mask * @d: IDE port info * * Enable the IDE PCI device. We attempt to enable the device in full @@ -189,9 +190,10 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise); * Returns zero on success or an error code */ -static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d) +static int ide_pci_enable(struct pci_dev *dev, int bars, + const struct ide_port_info *d) { - int ret, bars; + int ret; if (pci_enable_device(dev)) { ret = pci_enable_device_io(dev); @@ -216,18 +218,6 @@ static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d) goto out; } - if (d->host_flags & IDE_HFLAG_SINGLE) - bars = (1 << 2) - 1; - else - bars = (1 << 4) - 1; - - if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) { - if (d->host_flags & IDE_HFLAG_CS5520) - bars |= (1 << 2); - else - bars |= (1 << 4); - } - ret = pci_request_selected_regions(dev, bars, d->name); if (ret < 0) printk(KERN_ERR "%s %s: can't reserve resources\n", @@ -403,6 +393,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d) /** * ide_setup_pci_controller - set up IDE PCI * @dev: PCI device + * @bars: PCI BARs mask * @d: IDE port info * @noisy: verbose flag * @@ -411,7 +402,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d) * and enables it if need be */ -static int ide_setup_pci_controller(struct pci_dev *dev, +static int ide_setup_pci_controller(struct pci_dev *dev, int bars, const struct ide_port_info *d, int noisy) { int ret; @@ -420,7 +411,7 @@ static int ide_setup_pci_controller(struct pci_dev *dev, if (noisy) ide_setup_pci_noise(dev, d); - ret = ide_pci_enable(dev, d); + ret = ide_pci_enable(dev, bars, d); if (ret < 0) goto out; @@ -428,16 +419,20 @@ static int ide_setup_pci_controller(struct pci_dev *dev, if (ret < 0) { printk(KERN_ERR "%s %s: error accessing PCI regs\n", d->name, pci_name(dev)); - goto out; + goto out_free_bars; } if (!(pcicmd & PCI_COMMAND_IO)) { /* is device disabled? */ ret = ide_pci_configure(dev, d); if (ret < 0) - goto out; + goto out_free_bars; printk(KERN_INFO "%s %s: device enabled (Linux)\n", d->name, pci_name(dev)); } + goto out; + +out_free_bars: + pci_release_selected_regions(dev, bars); out: return ret; } @@ -540,13 +535,28 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, { struct pci_dev *pdev[] = { dev1, dev2 }; struct ide_host *host; - int ret, i, n_ports = dev2 ? 4 : 2; + int ret, i, n_ports = dev2 ? 4 : 2, bars; struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL }; + if (d->host_flags & IDE_HFLAG_SINGLE) + bars = (1 << 2) - 1; + else + bars = (1 << 4) - 1; + + if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) { + if (d->host_flags & IDE_HFLAG_CS5520) + bars |= (1 << 2); + else + bars |= (1 << 4); + } + for (i = 0; i < n_ports / 2; i++) { - ret = ide_setup_pci_controller(pdev[i], d, !i); - if (ret < 0) + ret = ide_setup_pci_controller(pdev[i], bars, d, !i); + if (ret < 0) { + if (i == 1) + pci_release_selected_regions(pdev[0], bars); goto out; + } ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]); } @@ -554,7 +564,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, host = ide_host_alloc(d, hws, n_ports); if (host == NULL) { ret = -ENOMEM; - goto out; + goto out_free_bars; } host->dev[0] = &dev1->dev; @@ -576,7 +586,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, * do_ide_setup_pci_device() on the first device! */ if (ret < 0) - goto out; + goto out_free_bars; /* fixup IRQ */ if (ide_pci_is_in_compatibility_mode(pdev[i])) { @@ -589,6 +599,13 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, ret = ide_host_register(host, d, hws); if (ret) ide_host_free(host); + else + goto out; + +out_free_bars: + i = n_ports / 2; + while (i--) + pci_release_selected_regions(pdev[i], bars); out: return ret; } -- cgit From b1f9e5e355e909000fcccfd2bc31f7c1ded358ab Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 2 Oct 2017 11:52:47 +0100 Subject: ide: fix IRQ assignment for PCI bus order probing We used to assign IRQs for all devices at boot-time, before any drivers claimed devices. The following commits: 30fdfb929e82 ("PCI: Add a call to pci_assign_irq() in pci_device_probe()") 0e4c2eeb758a ("alpha/PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") changed this so we now call pci_assign_irq() from pci_device_probe() when we call a driver's probe method. The ide_scan_pcibus() path (enabled by CONFIG_IDEPCI_PCIBUS_ORDER) bypasses pci_device_probe() so it can guarantee devices are claimed in order of PCI bus address. It calls the driver's probe method directly, so it misses the pci_assign_irq() call (and other PCI initialization functions), which causes failures like this: ide0: disabled, no IRQ ide0: failed to initialize IDE interface ide0: disabling port cmd64x 0000:00:02.0: IDE controller (0x1095:0x0646 rev 0x07) CMD64x_IDE 0000:00:02.0: BAR 0: can't reserve [io 0x8050-0x8057] cmd64x 0000:00:02.0: can't reserve resources CMD64x_IDE: probe of 0000:00:02.0 failed with error -16 ide_generic: please use "probe_mask=0x3f" module parameter for probing all legacy ISA IDE ports ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x94/0xd0 sysfs: cannot create duplicate filename '/class/ide_port/ide0' ... Trace: [] sysfs_warn_dup+0x94/0xd0 [] warn_slowpath_fmt+0x58/0x70 [] sysfs_warn_dup+0x94/0xd0 [] kernfs_path_from_node+0x30/0x60 [] kernfs_put+0x16c/0x2c0 [] kernfs_put+0x16c/0x2c0 [] sysfs_do_create_link_sd.isra.2+0x100/0x120 [] device_add+0x2a4/0x7c0 [] device_create_groups_vargs+0x14c/0x170 [] device_create_groups_vargs+0x98/0x170 [] device_create+0x50/0x70 [] ide_host_register+0x48c/0xa00 [] ide_host_register+0x450/0xa00 [] device_register+0x20/0x50 [] ide_host_register+0x450/0xa00 [] ide_host_add+0x64/0xe0 [] kobject_uevent_env+0x16c/0x710 [] do_one_initcall+0x68/0x260 [] kernel_init+0x1c/0x1a0 ... ---[ end trace 24a70433c3e4d374 ]--- ide0: disabling port Fix the IRQ allocation issue by calling pci_assign_irq() from ide_scan_pcidev() before probing the IDE PCI drivers, so that IRQs for a given PCI device are allocated for the IDE PCI drivers to use them for device configuration. Fixes: 30fdfb929e82 ("PCI: Add a call to pci_assign_irq() in pci_device_probe()") Fixes: 0e4c2eeb758a ("alpha/PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") Link: http://lkml.kernel.org/r/32ec730f-c1b0-5584-cd35-f8a809122b96@roeck-us.net Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Lorenzo Pieralisi [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas Reviewed-by: Bartlomiej Zolnierkiewicz Acked-by: David S. Miller Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner --- drivers/ide/ide-scan-pci.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c index 86aa88aeb3a6..acf874800ca4 100644 --- a/drivers/ide/ide-scan-pci.c +++ b/drivers/ide/ide-scan-pci.c @@ -56,6 +56,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev) { struct list_head *l; struct pci_driver *d; + int ret; list_for_each(l, &ide_pci_drivers) { d = list_entry(l, struct pci_driver, node); @@ -63,10 +64,14 @@ static int __init ide_scan_pcidev(struct pci_dev *dev) const struct pci_device_id *id = pci_match_id(d->id_table, dev); - if (id != NULL && d->probe(dev, id) >= 0) { - dev->driver = d; - pci_dev_get(dev); - return 1; + if (id != NULL) { + pci_assign_irq(dev); + ret = d->probe(dev, id); + if (ret >= 0) { + dev->driver = d; + pci_dev_get(dev); + return 1; + } } } } -- cgit From 71300132975f364a0d3ebf68671a2ce4923191db Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 2 Oct 2017 16:53:07 +0300 Subject: drm/i915: Fix DDI PHY init if it was already on The common lane power down flag of a DPIO PHY has a funky semantic: after the initial enabling of the PHY (so from a disabled state) this flag will be clear. It will be set only after the PHY will be used for the first time (for instance due to enabling the corresponding pipe) and then become unused (due to disabling the pipe). During the initial PHY enablement we don't know which of the above phases we are in, so move the check for the flag where this is known, the HW readout code. This is where the rest of lane power down status checks are done anyway. This fixes at least a problem on GLK where after module reloading, the common lane power down flag of PHY1 is set, but the PHY is actually powered-on and properly set up. The GRC readout code for other PHYs will hence think that PHY1 is not powered initially and disable it after the GRC readout. This will cause the AUX power well related to PHY1 to get disabled in a stuck state, timing out when we try to enable it later. Cc: Ville Syrjala Fixes: e93da0a0137b ("drm/i915/bxt: Sanitiy check the PHY lane power down status") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102777 Signed-off-by: Imre Deak Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20171002135307.26117-1-imre.deak@intel.com (cherry picked from commit e19c1eb885ac4186e64c7e484424124f3145318e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_ddi.c | 3 ++- drivers/gpu/drm/i915/intel_dpio_phy.c | 20 -------------------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 4b4fd1f8110b..476681d5940c 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -1655,7 +1655,8 @@ bool intel_ddi_get_hw_state(struct intel_encoder *encoder, out: if (ret && IS_GEN9_LP(dev_priv)) { tmp = I915_READ(BXT_PHY_CTL(port)); - if ((tmp & (BXT_PHY_LANE_POWERDOWN_ACK | + if ((tmp & (BXT_PHY_CMNLANE_POWERDOWN_ACK | + BXT_PHY_LANE_POWERDOWN_ACK | BXT_PHY_LANE_ENABLED)) != BXT_PHY_LANE_ENABLED) DRM_ERROR("Port %c enabled but PHY powered down? " "(PHY_CTL %08x)\n", port_name(port), tmp); diff --git a/drivers/gpu/drm/i915/intel_dpio_phy.c b/drivers/gpu/drm/i915/intel_dpio_phy.c index 09b670929786..de38d014ed39 100644 --- a/drivers/gpu/drm/i915/intel_dpio_phy.c +++ b/drivers/gpu/drm/i915/intel_dpio_phy.c @@ -208,12 +208,6 @@ static const struct bxt_ddi_phy_info glk_ddi_phy_info[] = { }, }; -static u32 bxt_phy_port_mask(const struct bxt_ddi_phy_info *phy_info) -{ - return (phy_info->dual_channel * BIT(phy_info->channel[DPIO_CH1].port)) | - BIT(phy_info->channel[DPIO_CH0].port); -} - static const struct bxt_ddi_phy_info * bxt_get_phy_list(struct drm_i915_private *dev_priv, int *count) { @@ -313,7 +307,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv, enum dpio_phy phy) { const struct bxt_ddi_phy_info *phy_info; - enum port port; phy_info = bxt_get_phy_info(dev_priv, phy); @@ -335,19 +328,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv, return false; } - for_each_port_masked(port, bxt_phy_port_mask(phy_info)) { - u32 tmp = I915_READ(BXT_PHY_CTL(port)); - - if (tmp & BXT_PHY_CMNLANE_POWERDOWN_ACK) { - DRM_DEBUG_DRIVER("DDI PHY %d powered, but common lane " - "for port %c powered down " - "(PHY_CTL %08x)\n", - phy, port_name(port), tmp); - - return false; - } - } - return true; } -- cgit From 63ba395cd7a52431cbb61658dad3beb5b24e9300 Mon Sep 17 00:00:00 2001 From: Aleksander Morgado Date: Wed, 27 Sep 2017 23:31:03 +0200 Subject: rndis_host: support Novatel Verizon USB730L MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat the ef/04/01 interface class/subclass/protocol combination used by the Novatel Verizon USB730L (1410:9030) as a possible RNDIS interface. T: Bus=01 Lev=02 Prnt=02 Port=01 Cnt=02 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 3 P: Vendor=1410 ProdID=9030 Rev=03.10 S: Manufacturer=Novatel Wireless S: Product=MiFi USB730L S: SerialNumber=0123456789ABCDEF C: #Ifs= 3 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host I: If#= 2 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=00 Prot=00 Driver=usbhid Once the network interface is brought up, the user just needs to run a DHCP client to get IP address and routing setup. As a side note, other Novatel Verizon USB730L models with the same vid:pid end up exposing a standard ECM interface which doesn't require any other kernel update to make it work. Signed-off-by: Aleksander Morgado Reviewed-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 11 ++++++++++- drivers/net/usb/rndis_host.c | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 677a85360db1..29c7e2ec0dcb 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -54,11 +54,19 @@ static int is_wireless_rndis(struct usb_interface_descriptor *desc) desc->bInterfaceProtocol == 3); } +static int is_novatel_rndis(struct usb_interface_descriptor *desc) +{ + return (desc->bInterfaceClass == USB_CLASS_MISC && + desc->bInterfaceSubClass == 4 && + desc->bInterfaceProtocol == 1); +} + #else #define is_rndis(desc) 0 #define is_activesync(desc) 0 #define is_wireless_rndis(desc) 0 +#define is_novatel_rndis(desc) 0 #endif @@ -150,7 +158,8 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) */ rndis = (is_rndis(&intf->cur_altsetting->desc) || is_activesync(&intf->cur_altsetting->desc) || - is_wireless_rndis(&intf->cur_altsetting->desc)); + is_wireless_rndis(&intf->cur_altsetting->desc) || + is_novatel_rndis(&intf->cur_altsetting->desc)); memset(info, 0, sizeof(*info)); info->control = intf; diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index a151f267aebb..b807c91abe1d 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -632,6 +632,10 @@ static const struct usb_device_id products [] = { /* RNDIS for tethering */ USB_INTERFACE_INFO(USB_CLASS_WIRELESS_CONTROLLER, 1, 3), .driver_info = (unsigned long) &rndis_info, +}, { + /* Novatel Verizon USB730L */ + USB_INTERFACE_INFO(USB_CLASS_MISC, 4, 1), + .driver_info = (unsigned long) &rndis_info, }, { }, // END }; -- cgit From 4f02fb7617ba12ac15d261c654b9759ea8f1f1ef Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Sat, 30 Sep 2017 14:38:49 +0800 Subject: blk-throttle: fix possible io stall when upgrade to max There is a case which will lead to io stall. The case is described as follows. /test1 |-subtest1 /test2 |-subtest2 And subtest1 and subtest2 each has 32 queued bios already. Now upgrade to max. In throtl_upgrade_state, it will try to dispatch bios as follows: 1) tg=subtest1, do nothing; 2) tg=test1, transfer 32 queued bios from subtest1 to test1; no pending left, no need to schedule next dispatch; 3) tg=subtest2, do nothing; 4) tg=test2, transfer 32 queued bios from subtest2 to test2; no pending left, no need to schedule next dispatch; 5) tg=/, transfer 8 queued bios from test1 to /, 8 queued bios from test2 to /, 8 queued bios from test1 to /, and 8 queued bios from test2 to /; note that test1 and test2 each still has 16 queued bios left; 6) tg=/, try to schedule next dispatch, but since disptime is now (update in tg_update_disptime, wait=0), pending timer is not scheduled in fact; 7) In throtl_upgrade_state it totally dispatches 32 queued bios and with 32 left. test1 and test2 each has 16 queued bios; 8) throtl_pending_timer_fn sees the left over bios, but could do nothing, because throtl_select_dispatch returns 0, and test1/test2 has no pending tg. The blktrace shows the following: 8,32 0 0 2.539007641 0 m N throtl upgrade to max 8,32 0 0 2.539072267 0 m N throtl /test2 dispatch nr_queued=16 read=0 write=16 8,32 7 0 2.539077142 0 m N throtl /test1 dispatch nr_queued=16 read=0 write=16 So force schedule dispatch if there are pending children. Reviewed-by: Shaohua Li Signed-off-by: Joseph Qi Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0fea76aa0f3f..17816a028dcb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1911,11 +1911,11 @@ static void throtl_upgrade_state(struct throtl_data *td) tg->disptime = jiffies - 1; throtl_select_dispatch(sq); - throtl_schedule_next_dispatch(sq, false); + throtl_schedule_next_dispatch(sq, true); } rcu_read_unlock(); throtl_select_dispatch(&td->service_queue); - throtl_schedule_next_dispatch(&td->service_queue, false); + throtl_schedule_next_dispatch(&td->service_queue, true); queue_work(kthrotld_workqueue, &td->dispatch_work); } -- cgit From 6cd1a6fef7058de15405b13d6587538853279c7b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Oct 2017 15:58:15 -0600 Subject: null_blk: change configfs dependency to select A recent commit made null_blk depend on configfs, which is kind of annoying since you now have to find this dependency and enable that as well. Discovered this since I no longer had null_blk available on a box I needed to debug, since it got killed when the config updated after the configfs change was merged. Fixes: 3bf2bd20734e ("nullb: add configfs interface") Reviewed-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 4a438b8abe27..2dfe99b328f8 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -17,7 +17,7 @@ if BLK_DEV config BLK_DEV_NULL_BLK tristate "Null test block driver" - depends on CONFIGFS_FS + select CONFIGFS_FS config BLK_DEV_FD tristate "Normal floppy disk support" -- cgit From 70e62f4bacdf31ea8a59f241c9229120cd06d9d1 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 3 Oct 2017 14:57:16 -0700 Subject: blk-mq-debugfs: fix device sched directory for default scheduler In blk_mq_debugfs_register(), I remembered to set up the per-hctx sched directories if a default scheduler was already configured by blk_mq_sched_init() from blk_mq_init_allocated_queue(), but I didn't do the same for the device-wide sched directory. Fix it. Fixes: d332ce091813 ("blk-mq-debugfs: allow schedulers to register debugfs attributes") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e73095643..de294d775acf 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -815,10 +815,14 @@ int blk_mq_debugfs_register(struct request_queue *q) goto err; /* - * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir + * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir * didn't exist yet (because we don't know what to name the directory * until the queue is registered to a gendisk). */ + if (q->elevator && !q->sched_debugfs_dir) + blk_mq_debugfs_register_sched(q); + + /* Similarly, blk_mq_init_hctx() couldn't do this previously. */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx)) goto err; -- cgit From 05946876f0c16f6fe1db692d575aba42b25f0811 Mon Sep 17 00:00:00 2001 From: David Wu Date: Sat, 30 Sep 2017 17:47:23 +0800 Subject: net: stmmac: dwmac-rk: Add RK3128 GMAC support Add constants and callback functions for the dwmac on rk3128 soc. As can be seen, the base structure is the same, only registers and the bits in them moved slightly. Signed-off-by: David Wu Signed-off-by: David S. Miller --- .../devicetree/bindings/net/rockchip-dwmac.txt | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 112 +++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt index 6af8eed1adeb..9c16ee2965a2 100644 --- a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt +++ b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt @@ -4,6 +4,7 @@ The device node has following properties. Required properties: - compatible: should be "rockchip,-gamc" + "rockchip,rk3128-gmac": found on RK312x SoCs "rockchip,rk3228-gmac": found on RK322x SoCs "rockchip,rk3288-gmac": found on RK3288 SoCs "rockchip,rk3328-gmac": found on RK3328 SoCs diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 99823f54696a..13133b30b575 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -83,6 +83,117 @@ struct rk_priv_data { (((tx) ? soc##_GMAC_TXCLK_DLY_ENABLE : soc##_GMAC_TXCLK_DLY_DISABLE) | \ ((rx) ? soc##_GMAC_RXCLK_DLY_ENABLE : soc##_GMAC_RXCLK_DLY_DISABLE)) +#define RK3128_GRF_MAC_CON0 0x0168 +#define RK3128_GRF_MAC_CON1 0x016c + +/* RK3128_GRF_MAC_CON0 */ +#define RK3128_GMAC_TXCLK_DLY_ENABLE GRF_BIT(14) +#define RK3128_GMAC_TXCLK_DLY_DISABLE GRF_CLR_BIT(14) +#define RK3128_GMAC_RXCLK_DLY_ENABLE GRF_BIT(15) +#define RK3128_GMAC_RXCLK_DLY_DISABLE GRF_CLR_BIT(15) +#define RK3128_GMAC_CLK_RX_DL_CFG(val) HIWORD_UPDATE(val, 0x7F, 7) +#define RK3128_GMAC_CLK_TX_DL_CFG(val) HIWORD_UPDATE(val, 0x7F, 0) + +/* RK3128_GRF_MAC_CON1 */ +#define RK3128_GMAC_PHY_INTF_SEL_RGMII \ + (GRF_BIT(6) | GRF_CLR_BIT(7) | GRF_CLR_BIT(8)) +#define RK3128_GMAC_PHY_INTF_SEL_RMII \ + (GRF_CLR_BIT(6) | GRF_CLR_BIT(7) | GRF_BIT(8)) +#define RK3128_GMAC_FLOW_CTRL GRF_BIT(9) +#define RK3128_GMAC_FLOW_CTRL_CLR GRF_CLR_BIT(9) +#define RK3128_GMAC_SPEED_10M GRF_CLR_BIT(10) +#define RK3128_GMAC_SPEED_100M GRF_BIT(10) +#define RK3128_GMAC_RMII_CLK_25M GRF_BIT(11) +#define RK3128_GMAC_RMII_CLK_2_5M GRF_CLR_BIT(11) +#define RK3128_GMAC_CLK_125M (GRF_CLR_BIT(12) | GRF_CLR_BIT(13)) +#define RK3128_GMAC_CLK_25M (GRF_BIT(12) | GRF_BIT(13)) +#define RK3128_GMAC_CLK_2_5M (GRF_CLR_BIT(12) | GRF_BIT(13)) +#define RK3128_GMAC_RMII_MODE GRF_BIT(14) +#define RK3128_GMAC_RMII_MODE_CLR GRF_CLR_BIT(14) + +static void rk3128_set_to_rgmii(struct rk_priv_data *bsp_priv, + int tx_delay, int rx_delay) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_PHY_INTF_SEL_RGMII | + RK3128_GMAC_RMII_MODE_CLR); + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON0, + DELAY_ENABLE(RK3128, tx_delay, rx_delay) | + RK3128_GMAC_CLK_RX_DL_CFG(rx_delay) | + RK3128_GMAC_CLK_TX_DL_CFG(tx_delay)); +} + +static void rk3128_set_to_rmii(struct rk_priv_data *bsp_priv) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_PHY_INTF_SEL_RMII | RK3128_GMAC_RMII_MODE); +} + +static void rk3128_set_rgmii_speed(struct rk_priv_data *bsp_priv, int speed) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + if (speed == 10) + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_CLK_2_5M); + else if (speed == 100) + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_CLK_25M); + else if (speed == 1000) + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_CLK_125M); + else + dev_err(dev, "unknown speed value for RGMII! speed=%d", speed); +} + +static void rk3128_set_rmii_speed(struct rk_priv_data *bsp_priv, int speed) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + if (speed == 10) { + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_RMII_CLK_2_5M | + RK3128_GMAC_SPEED_10M); + } else if (speed == 100) { + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_RMII_CLK_25M | + RK3128_GMAC_SPEED_100M); + } else { + dev_err(dev, "unknown speed value for RMII! speed=%d", speed); + } +} + +static const struct rk_gmac_ops rk3128_ops = { + .set_to_rgmii = rk3128_set_to_rgmii, + .set_to_rmii = rk3128_set_to_rmii, + .set_rgmii_speed = rk3128_set_rgmii_speed, + .set_rmii_speed = rk3128_set_rmii_speed, +}; + #define RK3228_GRF_MAC_CON0 0x0900 #define RK3228_GRF_MAC_CON1 0x0904 @@ -1313,6 +1424,7 @@ static int rk_gmac_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(rk_gmac_pm_ops, rk_gmac_suspend, rk_gmac_resume); static const struct of_device_id rk_gmac_dwmac_match[] = { + { .compatible = "rockchip,rk3128-gmac", .data = &rk3128_ops }, { .compatible = "rockchip,rk3228-gmac", .data = &rk3228_ops }, { .compatible = "rockchip,rk3288-gmac", .data = &rk3288_ops }, { .compatible = "rockchip,rk3328-gmac", .data = &rk3328_ops }, -- cgit From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Oct 2017 15:37:20 -0700 Subject: bpf: fix bpf_tail_call() x64 JIT - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 4 ++-- include/uapi/linux/bpf.h | 2 +- kernel/bpf/core.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c9573660d51..0554e8aef4d5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ #define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..f90860d1f897 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,7 +312,7 @@ union bpf_attr { * jump into another BPF program * @ctx: context pointer passed to next program * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run + * @index: 32-bit index inside array that selects specific program to run * Return: 0 on success or negative error * * int bpf_clone_redirect(skb, ifindex, flags) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..7b62df86be1d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1022,7 +1022,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; -- cgit From 8ee912dab95f1483156b6e994004bfcc3158d798 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 3 Oct 2017 16:14:15 -0700 Subject: alpha: fix build failures The build of alpha allmodconfig is giving error: arch/alpha/include/asm/mmu_context.h: In function 'ev5_switch_mm': arch/alpha/include/asm/mmu_context.h:160:2: error: implicit declaration of function 'task_thread_info'; did you mean 'init_thread_info'? [-Werror=implicit-function-declaration] The file 'mmu_context.h' needed an extra header file. Link: http://lkml.kernel.org/r/1505668810-7497-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/mmu_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h index 384bd47b5187..45c020a0fe76 100644 --- a/arch/alpha/include/asm/mmu_context.h +++ b/arch/alpha/include/asm/mmu_context.h @@ -8,6 +8,7 @@ */ #include +#include #include #include -- cgit From 630cc2b30a42c70628368a412beb4a5e5dd71abe Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:14:18 -0700 Subject: kernel/params.c: align add_sysfs_param documentation with code This parameter is named kp, so the documentation should use that. Fixes: 9b473de87209 ("param: Fix duplicate module prefixes") Link: http://lkml.kernel.org/r/20170919142656.64aea59e@endymion Signed-off-by: Jean Delvare Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/params.c b/kernel/params.c index 60b2d8101355..1cd8f1a895a8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject - * @kparam: the actual parameter definition to add to sysfs + * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and -- cgit From e00e5a26e3d37b47ce8ca59611db1e6135790ef8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 16:14:21 -0700 Subject: scripts/spelling.txt: add more spelling mistakes to spelling.txt Here are some of the more spelling mistakes and typos that I've found while fixing up spelling mistakes in kernel error message text over the past eight weeks. [akpm@linux-foundation.org: s/|/||/, per Joe] Link: http://lkml.kernel.org/r/20170919090818.5989-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Kees Cook Cc: Masahiro Yamada Cc: Stephen Boyd Cc: Joe Perches Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 400ef35169c5..aa0cc49ad1ad 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -53,6 +53,7 @@ acumulator||accumulator adapater||adapter addional||additional additionaly||additionally +additonal||additional addres||address adddress||address addreses||addresses @@ -67,6 +68,8 @@ adviced||advised afecting||affecting againt||against agaist||against +aggreataon||aggregation +aggreation||aggregation albumns||albums alegorical||allegorical algined||aligned @@ -80,6 +83,8 @@ aligment||alignment alignement||alignment allign||align alligned||aligned +alllocate||allocate +alloated||allocated allocatote||allocate allocatrd||allocated allocte||allocate @@ -171,6 +176,7 @@ availale||available availavility||availability availble||available availiable||available +availible||available avalable||available avaliable||available aysnc||async @@ -203,6 +209,7 @@ broadcat||broadcast cacluated||calculated caculation||calculation calender||calendar +calescing||coalescing calle||called callibration||calibration calucate||calculate @@ -210,6 +217,7 @@ calulate||calculate cancelation||cancellation cancle||cancel capabilites||capabilities +capabilty||capability capabitilies||capabilities capatibilities||capabilities capapbilities||capabilities @@ -302,6 +310,7 @@ containts||contains contaisn||contains contant||contact contence||contents +continious||continuous continous||continuous continously||continuously continueing||continuing @@ -393,6 +402,7 @@ differrence||difference diffrent||different diffrentiate||differentiate difinition||definition +dimesions||dimensions diplay||display direectly||directly disassocation||disassociation @@ -449,6 +459,7 @@ equiped||equipped equivelant||equivalent equivilant||equivalent eror||error +errorr||error estbalishment||establishment etsablishment||establishment etsbalishment||establishment @@ -481,6 +492,7 @@ failied||failed faillure||failure failue||failure failuer||failure +failng||failing faireness||fairness falied||failed faliure||failure @@ -493,6 +505,7 @@ fetaure||feature fetaures||features fileystem||filesystem fimware||firmware +firware||firmware finanize||finalize findn||find finilizes||finalizes @@ -502,6 +515,7 @@ folloing||following followign||following followings||following follwing||following +fonud||found forseeable||foreseeable forse||force fortan||fortran @@ -532,6 +546,7 @@ grabing||grabbing grahical||graphical grahpical||graphical grapic||graphic +grranted||granted guage||gauge guarenteed||guaranteed guarentee||guarantee @@ -543,6 +558,7 @@ happend||happened harware||hardware heirarchically||hierarchically helpfull||helpful +hybernate||hibernate hierachy||hierarchy hierarchie||hierarchy howver||however @@ -565,16 +581,19 @@ implemenation||implementation implementaiton||implementation implementated||implemented implemention||implementation +implementd||implemented implemetation||implementation implemntation||implementation implentation||implementation implmentation||implementation implmenting||implementing +incative||inactive incomming||incoming incompatabilities||incompatibilities incompatable||incompatible inconsistant||inconsistent increas||increase +incremeted||incremented incrment||increment indendation||indentation indended||intended @@ -619,6 +638,7 @@ interger||integer intermittant||intermittent internel||internal interoprability||interoperability +interuupt||interrupt interrface||interface interrrupt||interrupt interrup||interrupt @@ -638,8 +658,10 @@ intrrupt||interrupt intterrupt||interrupt intuative||intuitive invaid||invalid +invald||invalid invalde||invalid invalide||invalid +invalidiate||invalidate invalud||invalid invididual||individual invokation||invocation @@ -713,6 +735,7 @@ misformed||malformed mispelled||misspelled mispelt||misspelt mising||missing +mismactch||mismatch missmanaged||mismanaged missmatch||mismatch miximum||maximum @@ -731,6 +754,7 @@ multidimensionnal||multidimensional multple||multiple mumber||number muticast||multicast +mutilcast||multicast mutiple||multiple mutli||multi nams||names @@ -834,6 +858,7 @@ posible||possible positon||position possibilites||possibilities powerfull||powerful +preample||preamble preapre||prepare preceeded||preceded preceeding||preceding @@ -1059,6 +1084,7 @@ sturcture||structure subdirectoires||subdirectories suble||subtle substract||subtract +submition||submission succesfully||successfully succesful||successful successed||succeeded @@ -1078,6 +1104,7 @@ suppoted||supported suppported||supported suppport||support supress||suppress +surpressed||suppressed surpresses||suppresses susbsystem||subsystem suspeneded||suspended @@ -1091,6 +1118,7 @@ swithced||switched swithcing||switching swithed||switched swithing||switching +swtich||switch symetric||symmetric synax||syntax synchonized||synchronized @@ -1111,7 +1139,9 @@ therfore||therefore thier||their threds||threads threshhold||threshold +thresold||threshold throught||through +troughput||throughput thses||these tiggered||triggered tipically||typically @@ -1120,6 +1150,7 @@ tmis||this torerable||tolerable tramsmitted||transmitted tramsmit||transmit +tranasction||transaction tranfer||transfer transciever||transceiver transferd||transferred @@ -1133,6 +1164,7 @@ trasmission||transmission treshold||threshold trigerring||triggering trun||turn +tunning||tuning ture||true tyep||type udpate||update @@ -1199,6 +1231,7 @@ visiters||visitors vitual||virtual wakeus||wakeups wating||waiting +wiat||wait wether||whether whataver||whatever whcih||which -- cgit From fa87b91c94a8fd2c8502b6761be2d08a8e9bcf55 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 3 Oct 2017 16:14:24 -0700 Subject: include/linux/mm.h: fix typo in VM_MPX definition There's a typo in recent change of VM_MPX definition. We want it to be VM_HIGH_ARCH_4, not VM_HIGH_ARCH_BIT_4. This bug does cause visible regressions. In arch_vma_name the vmflags are tested against VM_MPX. With the incorrect value of VM_MPX, a number of vmas (such as the stack) test positive and end up being marked as "[mpx]" in /proc/N/maps instead of their correct names. This confuses tools like rr which expect to be able to find familiar vmas. Fixes: df3735c5b40f ("x86,mpx: make mpx depend on x86-64 to free up VMA flag") Link: http://lkml.kernel.org/r/20170918140253.36856-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Rik van Riel Cc: Dave Hansen Cc: Kyle Huey Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f8c10d336e42..065d99deb847 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -240,7 +240,7 @@ extern unsigned int kobjsize(const void *objp); #if defined(CONFIG_X86_INTEL_MPX) /* MPX specific bounds table or bounds directory */ -# define VM_MPX VM_HIGH_ARCH_BIT_4 +# define VM_MPX VM_HIGH_ARCH_4 #else # define VM_MPX VM_NONE #endif -- cgit From 4b22927f0cbd58303aac689e378d20bf56267a39 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 3 Oct 2017 16:14:27 -0700 Subject: ksm: fix unlocked iteration over vmas in cmp_and_merge_page() In this place mm is unlocked, so vmas or list may change. Down read mmap_sem to protect them from modifications. Link: http://lkml.kernel.org/r/150512788393.10691.8868381099691121308.stgit@localhost.localdomain Fixes: e86c59b1b12d ("mm/ksm: improve deduplication of zero pages with colouring") Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Reviewed-by: Andrea Arcangeli Cc: Minchan Kim Cc: zhong jiang Cc: Ingo Molnar Cc: Claudio Imbrenda Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 15dd7415f7b3..6cb60f46cce5 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1990,6 +1990,7 @@ static void stable_tree_append(struct rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) { + struct mm_struct *mm = rmap_item->mm; struct rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct stable_node *stable_node; @@ -2062,9 +2063,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) if (ksm_use_zero_pages && (checksum == zero_checksum)) { struct vm_area_struct *vma; - vma = find_mergeable_vma(rmap_item->mm, rmap_item->address); + down_read(&mm->mmap_sem); + vma = find_mergeable_vma(mm, rmap_item->address); err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); + up_read(&mm->mmap_sem); /* * In case of failure, the page was not really empty, so we * need to continue. Otherwise we're done. -- cgit From 19bfbe22f59a207417b2679e7e83c180419c9ec5 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 3 Oct 2017 16:14:31 -0700 Subject: mm, hugetlb, soft_offline: save compound page order before page migration This fixes a bug in madvise() where if you'd try to soft offline a hugepage via madvise(), while walking the address range you'd end up, using the wrong page offset due to attempting to get the compound order of a former but presently not compound page, due to dissolving the huge page (since commit c3114a84f7f9: "mm: hugetlb: soft-offline: dissolve source hugepage after successful migration"). As a result I ended up with all my free pages except one being offlined. Link: http://lkml.kernel.org/r/20170912204306.GA12053@gmail.com Fixes: c3114a84f7f9 ("mm: hugetlb: soft-offline: dissolve source hugepage after successful migration") Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Anshuman Khandual Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Hillf Danton Cc: Shaohua Li Cc: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: David Rientjes Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 21261ff0466f..25bade36e9ca 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -625,18 +625,26 @@ static int madvise_inject_error(int behavior, { struct page *page; struct zone *zone; + unsigned int order; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += PAGE_SIZE << - compound_order(compound_head(page))) { + + for (; start < end; start += PAGE_SIZE << order) { int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; + /* + * When soft offlining hugepages, after migrating the page + * we dissolve it, therefore in the second loop "page" will + * no longer be a compound page, and order will be 0. + */ + order = compound_order(compound_head(page)); + if (PageHWPoison(page)) { put_page(page); continue; -- cgit From b78412b8300a8453b78d2c1b0b925b66493bb011 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:34 -0700 Subject: sh: sh7722: remove nonexistent GPIO_PTQ7 to fix pinctrl registration Patch series "sh: sh7722/sh7757i/sh7264/sh7269: Fix pinctrl registration", v2. Magnus Damm reported that on sh7722/Migo-R, pinctrl registration fails with: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. Apparently GPIO_PTQ7 was defined in the enum, but never used. If enum values are defined, but never used, pinmux_pins[] contains (zero-filled) holes. Hence such entries are treated as pin zero, which was registered before, and pinctrl registration fails. I can't see how this ever worked, as at the time of commit f5e25ae52fef ("sh-pfc: Add sh7722 pinmux support"), pinmux_gpios[] in drivers/pinctrl/sh-pfc/pfc-sh7722.c already had the hole, and drivers/pinctrl/core.c already had the check. Some scripting revealed a few more broken drivers: - sh7757 has four holes, due to nonexistent GPIO_PT[JLNQ]7_RESV. - sh7264 and sh7269 define GPIO_PH[0-7], but don't use it with PINMUX_GPIO(). Patch 1 fixes the issue on sh7722, and was tested. Patches 3-4 should fix the issue on the other 3 SoCs, but was untested due to lack of hardware. This patch (of 4): On sh7722/Migo-R, pinctrl registration fails with: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. As GPIO_PTQ7 is defined in the enum, but never used, pinmux_pins[] contains a (zero-filled) hole. Hence this entry is treated as pin zero, which was registered before, and pinctrl registration fails. According to the datasheet, port PTQ7 does not exist. Hence remove GPIO_PTQ7 from the enum to fix this. Link: http://lkml.kernel.org/r/1505205657-18012-2-git-send-email-geert+renesas@glider.be Fixes: 8d7b5b0af7e070b9 ("sh: Add sh7722 pinmux code") Signed-off-by: Geert Uytterhoeven Reported-by: Magnus Damm Reviewed-by: Laurent Pinchart Tested-by: Jacopo Mondi Cc: Rich Felker Cc: Yoshihiro Shimoda Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh4/cpu/sh7722.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/include/cpu-sh4/cpu/sh7722.h b/arch/sh/include/cpu-sh4/cpu/sh7722.h index 3bb74e534d0f..78961ab78a5a 100644 --- a/arch/sh/include/cpu-sh4/cpu/sh7722.h +++ b/arch/sh/include/cpu-sh4/cpu/sh7722.h @@ -67,7 +67,7 @@ enum { GPIO_PTN3, GPIO_PTN2, GPIO_PTN1, GPIO_PTN0, /* PTQ */ - GPIO_PTQ7, GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4, + GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4, GPIO_PTQ3, GPIO_PTQ2, GPIO_PTQ1, GPIO_PTQ0, /* PTR */ -- cgit From d8ce38f69843a56da044e56b6c16aecfbc3c6e39 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:37 -0700 Subject: sh: sh7757: remove nonexistent GPIO_PT[JLNQ]7_RESV to fix pinctrl registration Commit 3810e96056ff ("sh: modify pinmux for SH7757 2nd cut") renamed GPIO_PT[JLNQ]7 to GPIO_PT[JLNQ]7_RESV, and removed the existing users from the pinmux_pins[] array. However, pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. Hence entries were not really removed, but replaced by (zero-filled) holes. Such entries are treated as pin zero, which was registered before, thus leading to pinctrl registration failures, as seen on sh7722: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 Remove GPIO_PT[JLNQ]7_RESV from the enum to fix this. Link: http://lkml.kernel.org/r/1505205657-18012-3-git-send-email-geert+renesas@glider.be Fixes: 3810e96056ffddf6 ("sh: modify pinmux for SH7757 2nd cut") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Cc: Jacopo Mondi Cc: Magnus Damm Cc: Rich Felker Cc: Yoshihiro Shimoda Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh4/cpu/sh7757.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sh/include/cpu-sh4/cpu/sh7757.h b/arch/sh/include/cpu-sh4/cpu/sh7757.h index 5340f3bc1863..b40fb541e72a 100644 --- a/arch/sh/include/cpu-sh4/cpu/sh7757.h +++ b/arch/sh/include/cpu-sh4/cpu/sh7757.h @@ -40,7 +40,7 @@ enum { /* PTJ */ GPIO_PTJ0, GPIO_PTJ1, GPIO_PTJ2, GPIO_PTJ3, - GPIO_PTJ4, GPIO_PTJ5, GPIO_PTJ6, GPIO_PTJ7_RESV, + GPIO_PTJ4, GPIO_PTJ5, GPIO_PTJ6, /* PTK */ GPIO_PTK0, GPIO_PTK1, GPIO_PTK2, GPIO_PTK3, @@ -48,7 +48,7 @@ enum { /* PTL */ GPIO_PTL0, GPIO_PTL1, GPIO_PTL2, GPIO_PTL3, - GPIO_PTL4, GPIO_PTL5, GPIO_PTL6, GPIO_PTL7_RESV, + GPIO_PTL4, GPIO_PTL5, GPIO_PTL6, /* PTM */ GPIO_PTM0, GPIO_PTM1, GPIO_PTM2, GPIO_PTM3, @@ -56,7 +56,7 @@ enum { /* PTN */ GPIO_PTN0, GPIO_PTN1, GPIO_PTN2, GPIO_PTN3, - GPIO_PTN4, GPIO_PTN5, GPIO_PTN6, GPIO_PTN7_RESV, + GPIO_PTN4, GPIO_PTN5, GPIO_PTN6, /* PTO */ GPIO_PTO0, GPIO_PTO1, GPIO_PTO2, GPIO_PTO3, @@ -68,7 +68,7 @@ enum { /* PTQ */ GPIO_PTQ0, GPIO_PTQ1, GPIO_PTQ2, GPIO_PTQ3, - GPIO_PTQ4, GPIO_PTQ5, GPIO_PTQ6, GPIO_PTQ7_RESV, + GPIO_PTQ4, GPIO_PTQ5, GPIO_PTQ6, /* PTR */ GPIO_PTR0, GPIO_PTR1, GPIO_PTR2, GPIO_PTR3, -- cgit From eae3df7e82318d798f45dedf111e241805ec7a4a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:41 -0700 Subject: sh: sh7264: remove nonexistent GPIO_PH[0-7] to fix pinctrl registration Pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. If enum values are defined, but never used, pinmux_pins[] contains (zero-filled) holes. Such entries are treated as pin zero, which was registered before, thus leading to pinctrl registration failures, as seen on sh7722: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 Remove GPIO_PH[0-7] from the enum to fix this. Link: http://lkml.kernel.org/r/1505205657-18012-4-git-send-email-geert+renesas@glider.be Fixes: 41797f75486d8ca3 ("sh: Add pinmux for sh7264") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Cc: Jacopo Mondi Cc: Magnus Damm Cc: Rich Felker Cc: Yoshihiro Shimoda Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh2a/cpu/sh7264.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7264.h b/arch/sh/include/cpu-sh2a/cpu/sh7264.h index 4d1ef6d74bd6..2ae0e938b657 100644 --- a/arch/sh/include/cpu-sh2a/cpu/sh7264.h +++ b/arch/sh/include/cpu-sh2a/cpu/sh7264.h @@ -43,9 +43,7 @@ enum { GPIO_PG7, GPIO_PG6, GPIO_PG5, GPIO_PG4, GPIO_PG3, GPIO_PG2, GPIO_PG1, GPIO_PG0, - /* Port H */ - GPIO_PH7, GPIO_PH6, GPIO_PH5, GPIO_PH4, - GPIO_PH3, GPIO_PH2, GPIO_PH1, GPIO_PH0, + /* Port H - Port H does not have a Data Register */ /* Port I - not on device */ -- cgit From d9d73e81fe82fdf4ee65a48c26531edc04108349 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:44 -0700 Subject: sh: sh7269: remove nonexistent GPIO_PH[0-7] to fix pinctrl registration Pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. If enum values are defined, but never used, pinmux_pins[] contains (zero-filled) holes. Such entries are treated as pin zero, which was registered before, thus leading to pinctrl registration failures, as seen on sh7722: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 Remove GPIO_PH[0-7] from the enum to fix this. Link: http://lkml.kernel.org/r/1505205657-18012-5-git-send-email-geert+renesas@glider.be Fixes: ef0fa5331a73e479 ("sh: Add pinmux for sh7269") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Cc: Yoshinori Sato Cc: Rich Felker Cc: Magnus Damm Cc: Yoshihiro Shimoda Cc: Jacopo Mondi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh2a/cpu/sh7269.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7269.h b/arch/sh/include/cpu-sh2a/cpu/sh7269.h index 2a0ca8780f0d..13c495a9fc00 100644 --- a/arch/sh/include/cpu-sh2a/cpu/sh7269.h +++ b/arch/sh/include/cpu-sh2a/cpu/sh7269.h @@ -45,9 +45,7 @@ enum { GPIO_PG7, GPIO_PG6, GPIO_PG5, GPIO_PG4, GPIO_PG3, GPIO_PG2, GPIO_PG1, GPIO_PG0, - /* Port H */ - GPIO_PH7, GPIO_PH6, GPIO_PH5, GPIO_PH4, - GPIO_PH3, GPIO_PH2, GPIO_PH1, GPIO_PH0, + /* Port H - Port H does not have a Data Register */ /* Port I - not on device */ -- cgit From d5567c9df1ef001b2a7e6684b3b3498371ee4cae Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Tue, 3 Oct 2017 16:14:47 -0700 Subject: z3fold: fix potential race in z3fold_reclaim_page It is possible that on a (partially) unsuccessful page reclaim, kref_put() called in z3fold_reclaim_page() does not yield page release, but the page is released shortly afterwards by another thread. Then z3fold_reclaim_page() would try to list_add() that (released) page again which is obviously a bug. To avoid that, spin_lock() has to be taken earlier, before the kref_put() call mentioned earlier. Link: http://lkml.kernel.org/r/20170913162937.bfff21c7d12b12a5f47639fd@gmail.com Signed-off-by: Vitaly Wool Cc: Dan Streetman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 486550df32be..b04fa3ba1bf2 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -875,16 +875,18 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) goto next; } next: + spin_lock(&pool->lock); if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { + spin_unlock(&pool->lock); free_z3fold_page(page); return 0; } } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { atomic64_dec(&pool->pages_nr); + spin_unlock(&pool->lock); return 0; } - spin_lock(&pool->lock); /* * Add to the beginning of LRU. -- cgit From 4d4bbd8526a8fbeb2c090ea360211fceff952383 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:14:50 -0700 Subject: mm, oom_reaper: skip mm structs with mmu notifiers Andrea has noticed that the oom_reaper doesn't invalidate the range via mmu notifiers (mmu_notifier_invalidate_range_start/end) and that can corrupt the memory of the kvm guest for example. tlb_flush_mmu_tlbonly already invokes mmu notifiers but that is not sufficient as per Andrea: "mmu_notifier_invalidate_range cannot be used in replacement of mmu_notifier_invalidate_range_start/end. For KVM mmu_notifier_invalidate_range is a noop and rightfully so. A MMU notifier implementation has to implement either ->invalidate_range method or the invalidate_range_start/end methods, not both. And if you implement invalidate_range_start/end like KVM is forced to do, calling mmu_notifier_invalidate_range in common code is a noop for KVM. For those MMU notifiers that can get away only implementing ->invalidate_range, the ->invalidate_range is implicitly called by mmu_notifier_invalidate_range_end(). And only those secondary MMUs that share the same pagetable with the primary MMU (like AMD iommuv2) can get away only implementing ->invalidate_range" As the callback is allowed to sleep and the implementation is out of hand of the MM it is safer to simply bail out if there is an mmu notifier registered. In order to not fail too early make the mm_has_notifiers check under the oom_lock and have a little nap before failing to give the current oom victim some more time to exit. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170913113427.2291-1-mhocko@kernel.org Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Michal Hocko Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 5 +++++ mm/oom_kill.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 7b2e31b1745a..6866e8126982 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -400,6 +400,11 @@ extern void mmu_notifier_synchronize(void); #else /* CONFIG_MMU_NOTIFIER */ +static inline int mm_has_notifiers(struct mm_struct *mm) +{ + return 0; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 99736e026712..dee0f75c3013 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include "internal.h" @@ -494,6 +495,21 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto unlock_oom; } + /* + * If the mm has notifiers then we would need to invalidate them around + * unmap_page_range and that is risky because notifiers can sleep and + * what they do is basically undeterministic. So let's have a short + * sleep to give the oom victim some more time. + * TODO: we really want to get rid of this ugly hack and make sure that + * notifiers cannot block for unbounded amount of time and add + * mmu_notifier_invalidate_range_{start,end} around unmap_page_range + */ + if (mm_has_notifiers(mm)) { + up_read(&mm->mmap_sem); + schedule_timeout_idle(HZ); + goto unlock_oom; + } + /* * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't * work on the mm anymore. The check for MMF_OOM_SKIP must run -- cgit From 72f0184c8a00c70179cfed6266e2e06b4d400065 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:14:53 -0700 Subject: mm, memcg: remove hotplug locking from try_charge The following lockdep splat has been noticed during LTP testing ====================================================== WARNING: possible circular locking dependency detected 4.13.0-rc3-next-20170807 #12 Not tainted ------------------------------------------------------ a.out/4771 is trying to acquire lock: (cpu_hotplug_lock.rw_sem){++++++}, at: [] drain_all_stock.part.35+0x18/0x140 but task is already holding lock: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x175/0x530 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){++++++}: lock_acquire+0xc9/0x230 __might_fault+0x70/0xa0 _copy_to_user+0x23/0x70 filldir+0xa7/0x110 xfs_dir2_sf_getdents.isra.10+0x20c/0x2c0 [xfs] xfs_readdir+0x1fa/0x2c0 [xfs] xfs_file_readdir+0x30/0x40 [xfs] iterate_dir+0x17a/0x1a0 SyS_getdents+0xb0/0x160 entry_SYSCALL_64_fastpath+0x1f/0xbe -> #2 (&type->i_mutex_dir_key#3){++++++}: lock_acquire+0xc9/0x230 down_read+0x51/0xb0 lookup_slow+0xde/0x210 walk_component+0x160/0x250 link_path_walk+0x1a6/0x610 path_openat+0xe4/0xd50 do_filp_open+0x91/0x100 file_open_name+0xf5/0x130 filp_open+0x33/0x50 kernel_read_file_from_path+0x39/0x80 _request_firmware+0x39f/0x880 request_firmware_direct+0x37/0x50 request_microcode_fw+0x64/0xe0 reload_store+0xf7/0x180 dev_attr_store+0x18/0x30 sysfs_kf_write+0x44/0x60 kernfs_fop_write+0x113/0x1a0 __vfs_write+0x37/0x170 vfs_write+0xc7/0x1c0 SyS_write+0x58/0xc0 do_syscall_64+0x6c/0x1f0 return_from_SYSCALL_64+0x0/0x7a -> #1 (microcode_mutex){+.+.+.}: lock_acquire+0xc9/0x230 __mutex_lock+0x88/0x960 mutex_lock_nested+0x1b/0x20 microcode_init+0xbb/0x208 do_one_initcall+0x51/0x1a9 kernel_init_freeable+0x208/0x2a7 kernel_init+0xe/0x104 ret_from_fork+0x2a/0x40 -> #0 (cpu_hotplug_lock.rw_sem){++++++}: __lock_acquire+0x153c/0x1550 lock_acquire+0xc9/0x230 cpus_read_lock+0x4b/0x90 drain_all_stock.part.35+0x18/0x140 try_charge+0x3ab/0x6e0 mem_cgroup_try_charge+0x7f/0x2c0 shmem_getpage_gfp+0x25f/0x1050 shmem_fault+0x96/0x200 __do_fault+0x1e/0xa0 __handle_mm_fault+0x9c3/0xe00 handle_mm_fault+0x16e/0x380 __do_page_fault+0x24a/0x530 do_page_fault+0x30/0x80 page_fault+0x28/0x30 other info that might help us debug this: Chain exists of: cpu_hotplug_lock.rw_sem --> &type->i_mutex_dir_key#3 --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&type->i_mutex_dir_key#3); lock(&mm->mmap_sem); lock(cpu_hotplug_lock.rw_sem); *** DEADLOCK *** 2 locks held by a.out/4771: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x175/0x530 #1: (percpu_charge_mutex){+.+...}, at: [] try_charge+0x397/0x6e0 The problem is very similar to the one fixed by commit a459eeb7b852 ("mm, page_alloc: do not depend on cpu hotplug locks inside the allocator"). We are taking hotplug locks while we can be sitting on top of basically arbitrary locks. This just calls for problems. We can get rid of {get,put}_online_cpus, fortunately. We do not have to be worried about races with memory hotplug because drain_local_stock, which is called from both the WQ draining and the memory hotplug contexts, is always operating on the local cpu stock with IRQs disabled. The only thing to be careful about is that the target memcg doesn't vanish while we are still in drain_all_stock so take a reference on it. Link: http://lkml.kernel.org/r/20170913090023.28322-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Artem Savkov Tested-by: Artem Savkov Cc: Johannes Weiner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15af3da5af02..696c6529e900 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1777,6 +1777,10 @@ static void drain_local_stock(struct work_struct *dummy) struct memcg_stock_pcp *stock; unsigned long flags; + /* + * The only protection from memory hotplug vs. drain_stock races is + * that we always operate on local CPU stock here with IRQ disabled + */ local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); @@ -1821,27 +1825,33 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) /* If someone's already draining, avoid adding running more workers. */ if (!mutex_trylock(&percpu_charge_mutex)) return; - /* Notify other cpus that system-wide "drain" is running */ - get_online_cpus(); + /* + * Notify other cpus that system-wide "drain" is running + * We do not care about races with the cpu hotplug because cpu down + * as well as workers from this path always operate on the local + * per-cpu data. CPU up doesn't touch memcg_stock at all. + */ curcpu = get_cpu(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; memcg = stock->cached; - if (!memcg || !stock->nr_pages) + if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) continue; - if (!mem_cgroup_is_descendant(memcg, root_memcg)) + if (!mem_cgroup_is_descendant(memcg, root_memcg)) { + css_put(&memcg->css); continue; + } if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) drain_local_stock(&stock->work); else schedule_work_on(cpu, &stock->work); } + css_put(&memcg->css); } put_cpu(); - put_online_cpus(); mutex_unlock(&percpu_charge_mutex); } -- cgit From 3f2eb0287ebd62ec8d6d544f830285302279e6bf Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Tue, 3 Oct 2017 16:14:57 -0700 Subject: mm/memcg: avoid page count check for zone device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix for 4.14, zone device page always have an elevated refcount of one and thus page count sanity check in uncharge_page() is inappropriate for them. [mhocko@suse.com: nano-optimize VM_BUG_ON in uncharge_page] Link: http://lkml.kernel.org/r/20170914190011.5217-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Michal Hocko Reported-by: Evgeny Baskakov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 696c6529e900..d5f3a62887cf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5658,7 +5658,8 @@ static void uncharge_batch(const struct uncharge_gather *ug) static void uncharge_page(struct page *page, struct uncharge_gather *ug) { VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); + VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && + !PageHWPoison(page) , page); if (!page->mem_cgroup) return; -- cgit From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 3 Oct 2017 16:15:00 -0700 Subject: android: binder: drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. Also restore mmput_async, which was initially introduced in commit ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom reaper context"), and was removed in commit 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently"). Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Signed-off-by: Sherry Yang Signed-off-by: Greg Kroah-Hartman Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Acked-by: Michal Hocko Cc: Martijn Coenen Cc: Todd Kjos Cc: Riley Andrews Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Hoeun Ryu Cc: Christopher Lameter Cc: Vegard Nossum Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/android/binder_alloc.c | 18 ++++++++++++------ include/linux/sched/mm.h | 6 ++++++ kernel/fork.c | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 8fe165844e47..064f5e31ec55 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct binder_alloc *alloc; uintptr_t page_addr; size_t index; + struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) @@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - if (alloc->vma) { + vma = alloc->vma; + if (vma) { mm = get_task_mm(alloc->tsk); if (!mm) goto err_get_task_mm_failed; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + } + + list_lru_isolate(lru, item); + spin_unlock(lock); + if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(alloc->vma, + zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE); @@ -950,13 +957,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_kernel_end(alloc, index); - list_lru_isolate(lru, item); - + spin_lock(lock); mutex_unlock(&alloc->mutex); - return LRU_REMOVED; + return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: - mmput(mm); + mmput_async(mm); err_get_task_mm_failed: err_page_already_freed: mutex_unlock(&alloc->mutex); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3a19c253bdb1..ae53e413fb13 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -84,6 +84,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +#ifdef CONFIG_MMU +/* same as above but performs the slow path from the async context. Can + * be called from the atomic context as well + */ +void mmput_async(struct mm_struct *); +#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 10646182440f..e702cb9ffbd8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, + async_put_work); + + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} +#endif + /** * set_mm_exe_file - change a reference to the mm's executable file * -- cgit From 6818600ff094ca255a7fe31838ad50c29656c3c5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 3 Oct 2017 16:15:03 -0700 Subject: mm,compaction: serialize waitqueue_active() checks (for real) Andrea brought to my attention that the L->{L,S} guarantees are completely bogus for this case. I was looking at the diagram, from the offending commit, when that _is_ the race, we had the load reordered already. What we need is at least S->L semantics, thus simply use wq_has_sleeper() to serialize the call for good. Link: http://lkml.kernel.org/r/20170914175313.GB811@linux-80c1.suse Fixes: 46acef048a6 (mm,compaction: serialize waitqueue_active() checks) Signed-off-by: Davidlohr Bueso Reported-by: Andrea Parri Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index fb548e4c7bd4..03d31a875341 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1999,17 +1999,14 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - /* - * Pairs with implicit barrier in wait_event_freezable() - * such that wakeups are not missed in the lockless - * waitqueue_active() call. - */ - smp_acquire__after_ctrl_dep(); - if (pgdat->kcompactd_classzone_idx > classzone_idx) pgdat->kcompactd_classzone_idx = classzone_idx; - if (!waitqueue_active(&pgdat->kcompactd_wait)) + /* + * Pairs with implicit barrier in wait_event_freezable() + * such that wakeups are not missed. + */ + if (!wq_has_sleeper(&pgdat->kcompactd_wait)) return; if (!kcompactd_node_suitable(pgdat)) -- cgit From 3552935742e0d5f0dafd823736f45bdaa7ba672c Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Tue, 3 Oct 2017 16:15:06 -0700 Subject: z3fold: fix stale list handling Fix the situation when clear_bit() is called for page->private before the page pointer is actually assigned. While at it, remove work_busy() check because it is costly and does not give 100% guarantee anyway. Signed-off-by: Vitaly Wool Cc: Dan Streetman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index b04fa3ba1bf2..b2ba2ba585f3 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -250,6 +250,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); + clear_bit(NEEDS_COMPACTING, &page->private); spin_lock(&pool->lock); if (!list_empty(&page->lru)) list_del(&page->lru); @@ -303,7 +304,6 @@ static void free_pages_work(struct work_struct *w) list_del(&zhdr->buddy); if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) continue; - clear_bit(NEEDS_COMPACTING, &page->private); spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); free_z3fold_page(page); @@ -624,10 +624,8 @@ lookup: * stale pages list. cancel_work_sync() can sleep so we must make * sure it won't be called in case we're in atomic context. */ - if (zhdr && (can_sleep || !work_pending(&zhdr->work) || - !unlikely(work_busy(&zhdr->work)))) { + if (zhdr && (can_sleep || !work_pending(&zhdr->work))) { list_del(&zhdr->buddy); - clear_bit(NEEDS_COMPACTING, &page->private); spin_unlock(&pool->stale_lock); if (can_sleep) cancel_work_sync(&zhdr->work); -- cgit From 57148a64e823bb1f49112fa52a92a7f372cda892 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Oct 2017 16:15:10 -0700 Subject: mm: meminit: mark init_reserved_page as __meminit The function is called from __meminit context and calls other __meminit functions but isn't it self mark as such today: WARNING: vmlinux.o(.text.unlikely+0x4516): Section mismatch in reference from the function init_reserved_page() to the function .meminit.text:early_pfn_to_nid() The function init_reserved_page() references the function __meminit early_pfn_to_nid(). This is often because init_reserved_page lacks a __meminit annotation or the annotation of early_pfn_to_nid is wrong. On most compilers, we don't notice this because the function gets inlined all the time. Adding __meminit here fixes the harmless warning for the old versions and is generally the correct annotation. Link: http://lkml.kernel.org/r/20170915193149.901180-1-arnd@arndb.de Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd") Signed-off-by: Arnd Bergmann Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c841af88836a..38d165a87860 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void init_reserved_page(unsigned long pfn) +static void __meminit init_reserved_page(unsigned long pfn) { pg_data_t *pgdat; int nid, zid; -- cgit From 31d1e130f4a0f8f629a460167569577cac9b17c1 Mon Sep 17 00:00:00 2001 From: Ioan Nicu Date: Tue, 3 Oct 2017 16:15:13 -0700 Subject: rapidio: remove global irq spinlocks from the subsystem Locking of config and doorbell operations should be done only if the underlying hardware requires it. This patch removes the global spinlocks from the rapidio subsystem and moves them to the mport drivers (fsl_rio and tsi721), only to the necessary places. For example, local config space read and write operations (lcread/lcwrite) are atomic in all existing drivers, so there should be no need for locking, while the cread/cwrite operations which generate maintenance transactions need to be synchronized with a lock. Later, each driver could chose to use a per-port lock instead of a global one, or even more granular locking. Link: http://lkml.kernel.org/r/20170824113023.GD50104@nokia.com Signed-off-by: Ioan Nicu Signed-off-by: Frank Kunz Acked-by: Alexandre Bounine Cc: Matt Porter Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/fsl_rio.c | 17 +++++++++++++++-- arch/powerpc/sysdev/fsl_rmu.c | 8 ++++++++ drivers/rapidio/devices/tsi721.c | 7 +++++++ drivers/rapidio/rio-access.c | 40 +++++----------------------------------- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 9234be1e66f5..5011ffea4e4b 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -71,6 +71,8 @@ #define RIWAR_WRTYP_ALLOC 0x00006000 #define RIWAR_SIZE_MASK 0x0000003F +static DEFINE_SPINLOCK(fsl_rio_config_lock); + #define __fsl_read_rio_config(x, addr, err, op) \ __asm__ __volatile__( \ "1: "op" %1,0(%2)\n" \ @@ -184,6 +186,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, u8 hopcount, u32 offset, int len, u32 *val) { struct rio_priv *priv = mport->priv; + unsigned long flags; u8 *data; u32 rval, err = 0; @@ -197,6 +200,8 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, if (offset > (0x1000000 - len) || !IS_ALIGNED(offset, len)) return -EINVAL; + spin_lock_irqsave(&fsl_rio_config_lock, flags); + out_be32(&priv->maint_atmu_regs->rowtar, (destid << 22) | (hopcount << 12) | (offset >> 12)); out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10)); @@ -213,6 +218,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, __fsl_read_rio_config(rval, data, err, "lwz"); break; default: + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); return -EINVAL; } @@ -221,6 +227,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, err, destid, hopcount, offset); } + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); *val = rval; return err; @@ -244,7 +251,10 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid, u8 hopcount, u32 offset, int len, u32 val) { struct rio_priv *priv = mport->priv; + unsigned long flags; u8 *data; + int ret = 0; + pr_debug ("fsl_rio_config_write:" " index %d destid %d hopcount %d offset %8.8x len %d val %8.8x\n", @@ -255,6 +265,8 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid, if (offset > (0x1000000 - len) || !IS_ALIGNED(offset, len)) return -EINVAL; + spin_lock_irqsave(&fsl_rio_config_lock, flags); + out_be32(&priv->maint_atmu_regs->rowtar, (destid << 22) | (hopcount << 12) | (offset >> 12)); out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10)); @@ -271,10 +283,11 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid, out_be32((u32 *) data, val); break; default: - return -EINVAL; + ret = -EINVAL; } + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); - return 0; + return ret; } static void fsl_rio_inbound_mem_init(struct rio_priv *priv) diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c index ab7a74c75be8..88b35a3dcdc5 100644 --- a/arch/powerpc/sysdev/fsl_rmu.c +++ b/arch/powerpc/sysdev/fsl_rmu.c @@ -104,6 +104,8 @@ #define DOORBELL_MESSAGE_SIZE 0x08 +static DEFINE_SPINLOCK(fsl_rio_doorbell_lock); + struct rio_msg_regs { u32 omr; u32 osr; @@ -626,9 +628,13 @@ err_out: int fsl_rio_doorbell_send(struct rio_mport *mport, int index, u16 destid, u16 data) { + unsigned long flags; + pr_debug("fsl_doorbell_send: index %d destid %4.4x data %4.4x\n", index, destid, data); + spin_lock_irqsave(&fsl_rio_doorbell_lock, flags); + /* In the serial version silicons, such as MPC8548, MPC8641, * below operations is must be. */ @@ -638,6 +644,8 @@ int fsl_rio_doorbell_send(struct rio_mport *mport, out_be32(&dbell->dbell_regs->oddatr, (index << 20) | data); out_be32(&dbell->dbell_regs->odmr, 0x00000001); + spin_unlock_irqrestore(&fsl_rio_doorbell_lock, flags); + return 0; } diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 315a4be8dc1e..9a68914100ad 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -51,6 +51,8 @@ module_param(mbox_sel, byte, S_IRUGO); MODULE_PARM_DESC(mbox_sel, "RIO Messaging MBOX Selection Mask (default: 0x0f = all)"); +static DEFINE_SPINLOCK(tsi721_maint_lock); + static void tsi721_omsg_handler(struct tsi721_device *priv, int ch); static void tsi721_imsg_handler(struct tsi721_device *priv, int ch); @@ -124,12 +126,15 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size, void __iomem *regs = priv->regs + TSI721_DMAC_BASE(priv->mdma.ch_id); struct tsi721_dma_desc *bd_ptr; u32 rd_count, swr_ptr, ch_stat; + unsigned long flags; int i, err = 0; u32 op = do_wr ? MAINT_WR : MAINT_RD; if (offset > (RIO_MAINT_SPACE_SZ - len) || (len != sizeof(u32))) return -EINVAL; + spin_lock_irqsave(&tsi721_maint_lock, flags); + bd_ptr = priv->mdma.bd_base; rd_count = ioread32(regs + TSI721_DMAC_DRDCNT); @@ -197,7 +202,9 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size, */ swr_ptr = ioread32(regs + TSI721_DMAC_DSWP); iowrite32(swr_ptr, regs + TSI721_DMAC_DSRP); + err_out: + spin_unlock_irqrestore(&tsi721_maint_lock, flags); return err; } diff --git a/drivers/rapidio/rio-access.c b/drivers/rapidio/rio-access.c index a3824baca2e5..3ee9af83b638 100644 --- a/drivers/rapidio/rio-access.c +++ b/drivers/rapidio/rio-access.c @@ -13,17 +13,9 @@ #include #include -/* - * These interrupt-safe spinlocks protect all accesses to RIO - * configuration space and doorbell access. - */ -static DEFINE_SPINLOCK(rio_config_lock); -static DEFINE_SPINLOCK(rio_doorbell_lock); - /* * Wrappers for all RIO configuration access functions. They just check - * alignment, do locking and call the low-level functions pointed to - * by rio_mport->ops. + * alignment and call the low-level functions pointed to by rio_mport->ops. */ #define RIO_8_BAD 0 @@ -44,13 +36,10 @@ int __rio_local_read_config_##size \ (struct rio_mport *mport, u32 offset, type *value) \ { \ int res; \ - unsigned long flags; \ u32 data = 0; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ res = mport->ops->lcread(mport, mport->id, offset, len, &data); \ *value = (type)data; \ - spin_unlock_irqrestore(&rio_config_lock, flags); \ return res; \ } @@ -67,13 +56,8 @@ int __rio_local_read_config_##size \ int __rio_local_write_config_##size \ (struct rio_mport *mport, u32 offset, type value) \ { \ - int res; \ - unsigned long flags; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ - res = mport->ops->lcwrite(mport, mport->id, offset, len, value);\ - spin_unlock_irqrestore(&rio_config_lock, flags); \ - return res; \ + return mport->ops->lcwrite(mport, mport->id, offset, len, value);\ } RIO_LOP_READ(8, u8, 1) @@ -104,13 +88,10 @@ int rio_mport_read_config_##size \ (struct rio_mport *mport, u16 destid, u8 hopcount, u32 offset, type *value) \ { \ int res; \ - unsigned long flags; \ u32 data = 0; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ res = mport->ops->cread(mport, mport->id, destid, hopcount, offset, len, &data); \ *value = (type)data; \ - spin_unlock_irqrestore(&rio_config_lock, flags); \ return res; \ } @@ -127,13 +108,9 @@ int rio_mport_read_config_##size \ int rio_mport_write_config_##size \ (struct rio_mport *mport, u16 destid, u8 hopcount, u32 offset, type value) \ { \ - int res; \ - unsigned long flags; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ - res = mport->ops->cwrite(mport, mport->id, destid, hopcount, offset, len, value); \ - spin_unlock_irqrestore(&rio_config_lock, flags); \ - return res; \ + return mport->ops->cwrite(mport, mport->id, destid, hopcount, \ + offset, len, value); \ } RIO_OP_READ(8, u8, 1) @@ -162,14 +139,7 @@ EXPORT_SYMBOL_GPL(rio_mport_write_config_32); */ int rio_mport_send_doorbell(struct rio_mport *mport, u16 destid, u16 data) { - int res; - unsigned long flags; - - spin_lock_irqsave(&rio_doorbell_lock, flags); - res = mport->ops->dsend(mport, mport->id, destid, data); - spin_unlock_irqrestore(&rio_doorbell_lock, flags); - - return res; + return mport->ops->dsend(mport, mport->id, destid, data); } EXPORT_SYMBOL_GPL(rio_mport_send_doorbell); -- cgit From a872eb2131e91ce7c89a8888974a5e22a272b12f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 3 Oct 2017 16:15:16 -0700 Subject: mm: fix RODATA_TEST failure "rodata_test: test data was not read only" On powerpc, RODATA_TEST fails with message the following messages: Freeing unused kernel memory: 528K rodata_test: test data was not read only This is because GCC allocates it to .data section: c0695034 g O .data 00000004 rodata_test_data Since commit 056b9d8a7692 ("mm: remove rodata_test_data export, add pr_fmt"), rodata_test_data is used only inside rodata_test.c By declaring it static, it gets properly allocated into .rodata section instead of .data: c04df710 l O .rodata 00000004 rodata_test_data Fixes: 056b9d8a7692 ("mm: remove rodata_test_data export, add pr_fmt") Link: http://lkml.kernel.org/r/20170921093729.1080368AC1@po15668-vm-win7.idsi0.si.c-s.fr Signed-off-by: Christophe Leroy Cc: Kees Cook Cc: Jinbum Park Cc: Segher Boessenkool Cc: David Laight Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rodata_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 6bb4deb12e78..d908c8769b48 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -14,7 +14,7 @@ #include #include -const int rodata_test_data = 0xC3; +static const int rodata_test_data = 0xC3; void rodata_test(void) { -- cgit From ae94264ed4b0cf7cd887947650db4c69acb62072 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 3 Oct 2017 16:15:19 -0700 Subject: zram: fix null dereference of handle In testing I found handle passed to zs_map_object in __zram_bvec_read is NULL so eh kernel goes oops in pin_object(). The reason is there is no routine to check the slot's freeing after getting the slot's lock. This patch fixes it. [minchan@kernel.org: v2] Link: http://lkml.kernel.org/r/1505887347-10881-1-git-send-email-minchan@kernel.org Link: http://lkml.kernel.org/r/1505788488-26723-1-git-send-email-minchan@kernel.org Fixes: 1f7319c74275 ("zram: partial IO refactoring") Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 2981c27d3aae..f149d3e61234 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -766,27 +766,6 @@ static void zram_slot_unlock(struct zram *zram, u32 index) bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); } -static bool zram_same_page_read(struct zram *zram, u32 index, - struct page *page, - unsigned int offset, unsigned int len) -{ - zram_slot_lock(zram, index); - if (unlikely(!zram_get_handle(zram, index) || - zram_test_flag(zram, index, ZRAM_SAME))) { - void *mem; - - zram_slot_unlock(zram, index); - mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, - zram_get_element(zram, index)); - kunmap_atomic(mem); - return true; - } - zram_slot_unlock(zram, index); - - return false; -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -884,11 +863,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zram_slot_unlock(zram, index); } - if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) - return 0; - zram_slot_lock(zram, index); handle = zram_get_handle(zram, index); + if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { + unsigned long value; + void *mem; + + value = handle ? zram_get_element(zram, index) : 0; + mem = kmap_atomic(page); + zram_fill_page(mem, PAGE_SIZE, value); + kunmap_atomic(mem); + zram_slot_unlock(zram, index); + return 0; + } + size = zram_get_obj_size(zram, index); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); -- cgit From 5bdfca6435b8294490ffb5b7c8b7d8eac3814b06 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 3 Oct 2017 16:15:23 -0700 Subject: m32r: define CPU_BIG_ENDIAN The build of m32r allmodconfig is giving lots of build warnings about: include/linux/byteorder/big_endian.h:7:2: warning: #warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN [-Wcpp] #warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN Define CPU_BIG_ENDIAN like the way CPU_LITTLE_ENDIAN is defined. Link: http://lkml.kernel.org/r/1505678083-10320-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 87cde1e4b38c..0777f3a8a1f3 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -194,6 +194,10 @@ config TIMER_DIVIDE int "Timer divider (integer)" default "128" +config CPU_BIG_ENDIAN + bool "Generate big endian code" + default n + config CPU_LITTLE_ENDIAN bool "Generate little endian code" default n -- cgit From f4e222c56c83b2aed7cc2b329fca7435508eefa1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 3 Oct 2017 16:15:25 -0700 Subject: mm: have filemap_check_and_advance_wb_err clear AS_EIO/AS_ENOSPC Eryu noticed that he could sometimes get a leftover error reported when it shouldn't be on fsync with ext2 and non-journalled ext4. The problem is that writeback_single_inode still uses filemap_fdatawait. That picks up a previously set AS_EIO flag, which would ordinarily have been cleared before. Since we're mostly using this function as a replacement for filemap_check_errors, have filemap_check_and_advance_wb_err clear AS_EIO and AS_ENOSPC when reporting an error. That should allow the new function to better emulate the behavior of the old with respect to these flags. Link: http://lkml.kernel.org/r/20170922133331.28812-1-jlayton@kernel.org Signed-off-by: Jeff Layton Reported-by: Eryu Guan Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index db250d0e0565..594d73fef8b4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -620,6 +620,14 @@ int file_check_and_advance_wb_err(struct file *file) trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } + + /* + * We're mostly using this function as a drop in replacement for + * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect + * that the legacy code would have had on these flags. + */ + clear_bit(AS_EIO, &mapping->flags); + clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); -- cgit From 24c92eb7dce0a299b8e1a8c5fa585844a53bf7f0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Oct 2017 16:15:29 -0700 Subject: mm: avoid marking swap cached page as lazyfree MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear SwapBacked). There is no lock to prevent the page is added to swap cache between these two steps by page reclaim. Page reclaim could add the page to swap cache and unmap the page. After page reclaim, the page is added back to lru. At that time, we probably start draining per-cpu pagevec and mark the page lazyfree. So the page could be in a state with SwapBacked cleared and PG_swapcache set. Next time there is a refault in the virtual address, do_swap_page can find the page from swap cache but the page has PageSwapCache false because SwapBacked isn't set, so do_swap_page will bail out and do nothing. The task will keep running into fault handler. Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Link: http://lkml.kernel.org/r/6537ef3814398c0073630b03f176263bc81f0902.1506446061.git.shli@fb.com Signed-off-by: Shaohua Li Reported-by: Artem Savkov Tested-by: Artem Savkov Reviewed-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Hillf Danton Cc: Hugh Dickins Cc: Mel Gorman Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 9295ae960d66..a77d68f2c1b6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -575,7 +575,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && - !PageUnevictable(page)) { + !PageSwapCache(page) && !PageUnevictable(page)) { bool active = PageActive(page); del_page_from_lru_list(page, lruvec, @@ -665,7 +665,7 @@ void deactivate_file_page(struct page *page) void mark_page_lazyfree(struct page *page) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && - !PageUnevictable(page)) { + !PageSwapCache(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); -- cgit From 9625456cc76391b7f3f2809579126542a8ed4d39 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Oct 2017 16:15:32 -0700 Subject: mm: fix data corruption caused by lazyfree page MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear SwapBacked). There is no lock to prevent the page is added to swap cache between these two steps by page reclaim. If page reclaim finds such page, it will simply add the page to swap cache without pageout the page to swap because the page is marked as clean. Next time, page fault will read data from the swap slot which doesn't have the original data, so we have a data corruption. To fix issue, we mark the page dirty and pageout the page. However, we shouldn't dirty all pages which is clean and in swap cache. swapin page is swap cache and clean too. So we only dirty page which is added into swap cache in page reclaim, which shouldn't be swapin page. As Minchan suggested, simply dirty the page in add_to_swap can do the job. Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Link: http://lkml.kernel.org/r/08c84256b007bf3f63c91d94383bd9eb6fee2daa.1506446061.git.shli@fb.com Signed-off-by: Shaohua Li Reported-by: Artem Savkov Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/swap_state.c b/mm/swap_state.c index 71ce2d1ccbf7..ed91091d1e68 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -242,6 +242,17 @@ int add_to_swap(struct page *page) * clear SWAP_HAS_CACHE flag. */ goto fail; + /* + * Normally the page will be dirtied in unmap because its pte should be + * dirty. A special case is MADV_FREE page. The page'e pte could have + * dirty bit cleared but the page's SwapBacked bit is still set because + * clearing the dirty bit and SwapBacked bit has no lock protected. For + * such page, unmap will not set dirty bit for it, so page reclaim will + * not write the page out. This can cause data corruption when the page + * is swap in later. Always setting the dirty bit for the page solves + * the problem. + */ + set_page_dirty(page); return 1; -- cgit From 7d790d2da386a52cfebcf0c898ba927bece9d4ab Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Tue, 3 Oct 2017 16:15:35 -0700 Subject: mm/device-public-memory: fix edge case in _vm_normal_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With device public pages at the end of my memory space, I'm getting output from _vm_normal_page(): BUG: Bad page map in process migrate_pages pte:c0800001ffff0d06 pmd:f95d3000 addr:00007fff89330000 vm_flags:00100073 anon_vma:c0000000fa899320 mapping: (null) index:7fff8933 file: (null) fault: (null) mmap: (null) readpage: (null) CPU: 0 PID: 13963 Comm: migrate_pages Tainted: P B OE 4.14.0-rc1-wip #155 Call Trace: dump_stack+0xb0/0xf4 (unreliable) print_bad_pte+0x28c/0x340 _vm_normal_page+0xc0/0x140 zap_pte_range+0x664/0xc10 unmap_page_range+0x318/0x670 unmap_vmas+0x74/0xe0 exit_mmap+0xe8/0x1f0 mmput+0xac/0x1f0 do_exit+0x348/0xcd0 do_group_exit+0x5c/0xf0 SyS_exit_group+0x1c/0x20 system_call+0x58/0x6c The pfn causing this is the very last one. Correct the bounds check accordingly. Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU") Link: http://lkml.kernel.org/r/1506092178-20351-1-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Reviewed-by: Jérôme Glisse Reviewed-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index ec4e15494901..a728bed16c20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -845,7 +845,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, * vm_normal_page() so that we do not have to special case all * call site of vm_normal_page(). */ - if (likely(pfn < highest_memmap_pfn)) { + if (likely(pfn <= highest_memmap_pfn)) { struct page *page = pfn_to_page(pfn); if (is_device_public_page(page)) { -- cgit From 384632e67e0829deb8015ee6ad916b180049d252 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 3 Oct 2017 16:15:38 -0700 Subject: userfaultfd: non-cooperative: fix fork use after free When reading the event from the uffd, we put it on a temporary fork_event list to detect if we can still access it after releasing and retaking the event_wqh.lock. If fork aborts and removes the event from the fork_event all is fine as long as we're still in the userfault read context and fork_event head is still alive. We've to put the event allocated in the fork kernel stack, back from fork_event list-head to the event_wqh head, before returning from userfaultfd_ctx_read, because the fork_event head lifetime is limited to the userfaultfd_ctx_read stack lifetime. Forgetting to move the event back to its event_wqh place then results in __remove_wait_queue(&ctx->event_wqh, &ewq->wq); in userfaultfd_event_wait_completion to remove it from a head that has been already freed from the reader stack. This could only happen if resolve_userfault_fork failed (for example if there are no file descriptors available to allocate the fork uffd). If it succeeded it was put back correctly. Furthermore, after find_userfault_evt receives a fork event, the forked userfault context in fork_nctx and uwq->msg.arg.reserved.reserved1 can be released by the fork thread as soon as the event_wqh.lock is released. Taking a reference on the fork_nctx before dropping the lock prevents an use after free in resolve_userfault_fork(). If the fork side aborted and it already released everything, we still try to succeed resolve_userfault_fork(), if possible. Fixes: 893e26e61d04eac9 ("userfaultfd: non-cooperative: Add fork() event") Link: http://lkml.kernel.org/r/20170920180413.26713-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Mark Rutland Tested-by: Mark Rutland Cc: Pavel Emelyanov Cc: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ef4b48d1ea42..1c713fd5b3e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -588,6 +588,12 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, break; if (ACCESS_ONCE(ctx->released) || fatal_signal_pending(current)) { + /* + * &ewq->wq may be queued in fork_event, but + * __remove_wait_queue ignores the head + * parameter. It would be a problem if it + * didn't. + */ __remove_wait_queue(&ctx->event_wqh, &ewq->wq); if (ewq->msg.event == UFFD_EVENT_FORK) { struct userfaultfd_ctx *new; @@ -1061,6 +1067,12 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, (unsigned long) uwq->msg.arg.reserved.reserved1; list_move(&uwq->wq.entry, &fork_event); + /* + * fork_nctx can be freed as soon as + * we drop the lock, unless we take a + * reference on it. + */ + userfaultfd_ctx_get(fork_nctx); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1091,19 +1103,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + /* + * The fork thread didn't abort, so we can + * drop the temporary refcount. + */ + userfaultfd_ctx_put(fork_nctx); + + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.entry); + /* + * If fork_event list wasn't empty and in turn + * the event wasn't already released by fork + * (the event is allocated on fork kernel + * stack), put the event back to its place in + * the event_wq. fork_event head will be freed + * as soon as we return so the event cannot + * stay queued there no matter the current + * "ret" value. + */ + list_del(&uwq->wq.entry); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); - if (!ret) { - spin_lock(&ctx->event_wqh.lock); - if (!list_empty(&fork_event)) { - uwq = list_first_entry(&fork_event, - typeof(*uwq), - wq.entry); - list_del(&uwq->wq.entry); - __add_wait_queue(&ctx->event_wqh, &uwq->wq); + /* + * Leave the event in the waitqueue and report + * error to userland if we failed to resolve + * the userfault fork. + */ + if (likely(!ret)) userfaultfd_event_complete(ctx, uwq); - } - spin_unlock(&ctx->event_wqh.lock); + } else { + /* + * Here the fork thread aborted and the + * refcount from the fork thread on fork_nctx + * has already been released. We still hold + * the reference we took before releasing the + * lock above. If resolve_userfault_fork + * failed we've to drop it because the + * fork_nctx has to be freed in such case. If + * it succeeded we'll hold it because the new + * uffd references it. + */ + if (ret) + userfaultfd_ctx_put(fork_nctx); } + spin_unlock(&ctx->event_wqh.lock); } return ret; -- cgit From c2315c187fa0d3ab363fdebe22718170b40473e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:42 -0700 Subject: exec: load_script: kill the onstack interp[BINPRM_BUF_SIZE] array Patch series "exec: binfmt_misc: fix use-after-free, kill iname[BINPRM_BUF_SIZE]". It looks like this code was always wrong, then commit 948b701a607f ("binfmt_misc: add persistent opened binary handler for containers") added more problems. This patch (of 6): load_script() can simply use i_name instead, it points into bprm->buf[] and nobody can change this memory until we call prepare_binprm(). The only complication is that we need to also change the signature of bprm_change_interp() but this change looks good too. While at it, do whitespace/style cleanups. NOTE: the real motivation for this change is that people want to increase BINPRM_BUF_SIZE, we need to change load_misc_binary() too but this looks more complicated because afaics it is very buggy. Link: http://lkml.kernel.org/r/20170918163446.GA26793@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Travis Gummels Cc: Ben Woodard Cc: Jim Foraker Cc: Cc: Al Viro Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_script.c | 17 +++++++++-------- fs/exec.c | 2 +- include/linux/binfmts.h | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index afdf4e3cafc2..7cde3f46ad26 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -19,7 +19,6 @@ static int load_script(struct linux_binprm *bprm) const char *i_arg, *i_name; char *cp; struct file *file; - char interp[BINPRM_BUF_SIZE]; int retval; if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) @@ -55,7 +54,7 @@ static int load_script(struct linux_binprm *bprm) break; } for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++); - if (*cp == '\0') + if (*cp == '\0') return -ENOEXEC; /* No interpreter name found */ i_name = cp; i_arg = NULL; @@ -65,7 +64,6 @@ static int load_script(struct linux_binprm *bprm) *cp++ = '\0'; if (*cp) i_arg = cp; - strcpy (interp, i_name); /* * OK, we've parsed out the interpreter name and * (optional) argument. @@ -80,24 +78,27 @@ static int load_script(struct linux_binprm *bprm) if (retval) return retval; retval = copy_strings_kernel(1, &bprm->interp, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; if (i_arg) { retval = copy_strings_kernel(1, &i_arg, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; } retval = copy_strings_kernel(1, &i_name, bprm); - if (retval) return retval; + if (retval) + return retval; bprm->argc++; - retval = bprm_change_interp(interp, bprm); + retval = bprm_change_interp(i_name, bprm); if (retval < 0) return retval; /* * OK, now restart the process with the interpreter's dentry. */ - file = open_exec(interp); + file = open_exec(i_name); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/exec.c b/fs/exec.c index ac34d9724684..5470d3c1892a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1410,7 +1410,7 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -int bprm_change_interp(char *interp, struct linux_binprm *bprm) +int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index fb44d6180ca0..18d05b5491f3 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -131,7 +131,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack); extern int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location); -extern int bprm_change_interp(char *interp, struct linux_binprm *bprm); +extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm); extern int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm); extern int prepare_bprm_creds(struct linux_binprm *bprm); -- cgit From baba1b29731c79d605100087b8f02f9e1cf5a344 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:45 -0700 Subject: exec: binfmt_misc: don't nullify Node->dentry in kill_node() kill_node() nullifies/checks Node->dentry to avoid double free. This complicates the next changes and this is very confusing: - we do not need to check dentry != NULL under entries_lock, kill_node() is always called under inode_lock(d_inode(root)) and we rely on this inode_lock() anyway, without this lock the MISC_FMT_OPEN_FILE cleanup could race with itself. - if kill_inode() was already called and ->dentry == NULL we should not even try to close e->interp_file. We can change bm_entry_write() to simply check !list_empty(list) before kill_node. Again, we rely on inode_lock(), in particular it saves us from the race with bm_status_write(), another caller of kill_node(). Link: http://lkml.kernel.org/r/20170922143641.GA17210@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ce7181ea60fa..6451e7520e05 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -603,11 +603,7 @@ static void kill_node(Node *e) struct dentry *dentry; write_lock(&entries_lock); - dentry = e->dentry; - if (dentry) { - list_del_init(&e->list); - e->dentry = NULL; - } + list_del_init(&e->list); write_unlock(&entries_lock); if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { @@ -615,12 +611,11 @@ static void kill_node(Node *e) e->interp_file = NULL; } - if (dentry) { - drop_nlink(d_inode(dentry)); - d_drop(dentry); - dput(dentry); - simple_release_fs(&bm_mnt, &entry_count); - } + dentry = e->dentry; + drop_nlink(d_inode(dentry)); + d_drop(dentry); + dput(dentry); + simple_release_fs(&bm_mnt, &entry_count); } /* / */ @@ -665,7 +660,8 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, root = file_inode(file)->i_sb->s_root; inode_lock(d_inode(root)); - kill_node(e); + if (!list_empty(&e->list)) + kill_node(e); inode_unlock(d_inode(root)); break; @@ -794,7 +790,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, inode_lock(d_inode(root)); while (!list_empty(&entries)) - kill_node(list_entry(entries.next, Node, list)); + kill_node(list_first_entry(&entries, Node, list)); inode_unlock(d_inode(root)); break; -- cgit From 83f918274e4b841d6fb817861ea0c896fba0c179 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:48 -0700 Subject: exec: binfmt_misc: shift filp_close(interp_file) from kill_node() to bm_evict_inode() To ensure that load_misc_binary() can't use the partially destroyed Node, see also the next patch. The current logic looks wrong in any case, once we close interp_file it doesn't make any sense to delay kfree(inode->i_private), this Node is no longer valid. Even if the MISC_FMT_OPEN_FILE/interp_file checks were not racy (they are), load_misc_binary() should not try to reopen ->interpreter if MISC_FMT_OPEN_FILE is set but ->interp_file is NULL. And I can't understand why do we use filp_close(), not fput(). Link: http://lkml.kernel.org/r/20170922143644.GA17216@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 6451e7520e05..203598ccb40a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -594,8 +594,13 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_evict_inode(struct inode *inode) { + Node *e = inode->i_private; + + if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) + filp_close(e->interp_file, NULL); + clear_inode(inode); - kfree(inode->i_private); + kfree(e); } static void kill_node(Node *e) @@ -606,11 +611,6 @@ static void kill_node(Node *e) list_del_init(&e->list); write_unlock(&entries_lock); - if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { - filp_close(e->interp_file, NULL); - e->interp_file = NULL; - } - dentry = e->dentry; drop_nlink(d_inode(dentry)); d_drop(dentry); -- cgit From eb23aa0317eb1f08e8d9d36b8753d42f03b32764 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:51 -0700 Subject: exec: binfmt_misc: remove the confusing e->interp_file != NULL checks If MISC_FMT_OPEN_FILE flag is set e->interp_file must be valid or we have a bug which should not be silently ignored. Link: http://lkml.kernel.org/r/20170922143647.GA17222@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 203598ccb40a..babfe7bd56f0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -205,7 +205,7 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto error; - if (fmt->flags & MISC_FMT_OPEN_FILE && fmt->interp_file) { + if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = filp_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); @@ -596,7 +596,7 @@ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; - if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) + if (e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); clear_inode(inode); -- cgit From 43a4f2619038002f48c78698c42c05692d4b4eb2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:55 -0700 Subject: exec: binfmt_misc: fix race between load_misc_binary() and kill_node() load_misc_binary() makes a local copy of fmt->interpreter under entries_lock to avoid the race with kill_node() but this is not enough; the whole Node can be freed after we drop entries_lock, not only the ->interpreter string. Add dget/dput(fmt->dentry) to ensure bm_evict_inode() can't destroy/free this Node. Link: http://lkml.kernel.org/r/20170922143650.GA17227@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Travis Gummels Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index babfe7bd56f0..f5f8c2541790 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -138,20 +138,23 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = -ENOEXEC; if (!enabled) - goto ret; + return retval; /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); - if (fmt) + if (fmt) { + dget(fmt->dentry); strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + } read_unlock(&entries_lock); if (!fmt) - goto ret; + return retval; /* Need to be able to load the file after exec */ + retval = -ENOENT; if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) - return -ENOENT; + goto ret; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); @@ -238,6 +241,7 @@ static int load_misc_binary(struct linux_binprm *bprm) goto error; ret: + dput(fmt->dentry); return retval; error: if (fd_binary > 0) -- cgit From 50097f74934e3ec8fb1e6f3087568b958972817d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:58 -0700 Subject: exec: binfmt_misc: kill the onstack iname[BINPRM_BUF_SIZE] array After the previous change "fmt" can't go away, we can kill iname/iname_addr and use fmt->interpreter. Link: http://lkml.kernel.org/r/20170922143653.GA17232@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f5f8c2541790..2a46762def31 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -54,7 +54,7 @@ typedef struct { int size; /* size of magic/mask */ char *magic; /* magic or filename extension */ char *mask; /* mask, NULL for exact match */ - char *interpreter; /* filename of interpreter */ + const char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; struct file *interp_file; @@ -131,8 +131,6 @@ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; - char iname[BINPRM_BUF_SIZE]; - const char *iname_addr = iname; int retval; int fd_binary = -1; @@ -143,10 +141,8 @@ static int load_misc_binary(struct linux_binprm *bprm) /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); - if (fmt) { + if (fmt) dget(fmt->dentry); - strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); - } read_unlock(&entries_lock); if (!fmt) return retval; @@ -198,13 +194,13 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &iname_addr, bprm); + retval = copy_strings_kernel(1, &fmt->interpreter, bprm); if (retval < 0) goto error; bprm->argc++; /* Update interp in case binfmt_script needs it. */ - retval = bprm_change_interp(iname, bprm); + retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) goto error; @@ -213,7 +209,7 @@ static int load_misc_binary(struct linux_binprm *bprm) if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { - interp_file = open_exec(iname); + interp_file = open_exec(fmt->interpreter); } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) -- cgit From 8cb5d7482810b7eb1bb05bf4f71bc93ce35e5896 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 16:16:01 -0700 Subject: lib/lz4: make arrays static const, reduces object code size Don't populate the read-only arrays dec32table and dec64table on the stack, instead make them both static const. Makes the object code smaller by over 10K bytes: Before: text data bss dec hex filename 31500 0 0 31500 7b0c lib/lz4/lz4_decompress.o After: text data bss dec hex filename 20237 176 0 20413 4fbd lib/lz4/lz4_decompress.o (gcc version 7.2.0 x86_64) Link: http://lkml.kernel.org/r/20170921221939.20820-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Christophe JAILLET Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Arnd Bergmann Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/lz4/lz4_decompress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index bd3574312b82..141734d255e4 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -85,8 +85,8 @@ static FORCE_INLINE int LZ4_decompress_generic( const BYTE * const lowLimit = lowPrefix - dictSize; const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize; - const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; - const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; + static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; + static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; const int safeDecode = (endOnInput == endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB))); -- cgit From 7240767450d6d8380fb153e2998a1bb4ede7b029 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Oct 2017 16:16:04 -0700 Subject: include/linux/bitfield.h: remove 32bit from FIELD_GET comment block I do not see anything that restricts this macro to 32 bit width. Link: http://lkml.kernel.org/r/1505921975-23379-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Jakub Kicinski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 8b9d6fff002d..f2deb71958b2 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -92,7 +92,7 @@ /** * FIELD_GET() - extract a bitfield element * @_mask: shifted mask defining the field's length and position - * @_reg: 32bit value of entire bitfield + * @_reg: value of entire bitfield * * FIELD_GET() extracts the field specified by @_mask from the * bitfield passed in as @_reg by masking and shifting it down. -- cgit From 3181c38e4df257852a0c0a53552fd5c869402886 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 3 Oct 2017 16:16:07 -0700 Subject: kernel/sysctl.c: remove duplicate UINT_MAX check on do_proc_douintvec_conv() do_proc_douintvec_conv() has two UINT_MAX checks, we can remove one. This has no functional changes other than fixing a compiler warning: kernel/sysctl.c:2190]: (warning) Identical condition '*lvalp>UINT_MAX', second condition is always false Fixes: 4f2fec00afa60 ("sysctl: simplify unsigned int support") Link: http://lkml.kernel.org/r/20170919072918.12066-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reported-by: David Binderman Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 423554ad3610..4da9e622471f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2186,8 +2186,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; -- cgit From f80c7dab95a1f0f968acbafe4426ee9525b6f6ab Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 3 Oct 2017 16:16:10 -0700 Subject: mm: memcontrol: use vmalloc fallback for large kmem memcg arrays For quick per-memcg indexing, slab caches and list_lru structures maintain linear arrays of descriptors. As the number of concurrent memory cgroups in the system goes up, this requires large contiguous allocations (8k cgroups = order-5, 16k cgroups = order-6 etc.) for every existing slab cache and list_lru, which can easily fail on loaded systems. E.g.: mkdir: page allocation failure: order:5, mode:0x14040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null) CPU: 1 PID: 6399 Comm: mkdir Not tainted 4.13.0-mm1-00065-g720bbe532b7c-dirty #481 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Call Trace: ? __alloc_pages_direct_compact+0x4c/0x110 __alloc_pages_nodemask+0xf50/0x1430 alloc_pages_current+0x60/0xc0 kmalloc_order_trace+0x29/0x1b0 __kmalloc+0x1f4/0x320 memcg_update_all_list_lrus+0xca/0x2e0 mem_cgroup_css_alloc+0x612/0x670 cgroup_apply_control_enable+0x19e/0x360 cgroup_mkdir+0x322/0x490 kernfs_iop_mkdir+0x55/0x80 vfs_mkdir+0xd0/0x120 SyS_mkdirat+0x6c/0xe0 SyS_mkdir+0x14/0x20 entry_SYSCALL_64_fastpath+0x18/0xad Mem-Info: active_anon:2965 inactive_anon:19 isolated_anon:0 active_file:100270 inactive_file:98846 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 slab_reclaimable:7328 slab_unreclaimable:16402 mapped:771 shmem:52 pagetables:278 bounce:0 free:13718 free_pcp:0 free_cma:0 This output is from an artificial reproducer, but we have repeatedly observed order-7 failures in production in the Facebook fleet. These systems become useless as they cannot run more jobs, even though there is plenty of memory to allocate 128 individual pages. Use kvmalloc and kvzalloc to fall back to vmalloc space if these arrays prove too large for allocating them physically contiguous. Link: http://lkml.kernel.org/r/20170918184919.20644-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Josef Bacik Acked-by: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 12 ++++++------ mm/slab_common.c | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 7a40fa2be858..f141f0c80ff3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -325,12 +325,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) { int size = memcg_nr_cache_ids; - nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); + nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); if (!nlru->memcg_lrus) return -ENOMEM; if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { - kfree(nlru->memcg_lrus); + kvfree(nlru->memcg_lrus); return -ENOMEM; } @@ -340,7 +340,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) { __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); - kfree(nlru->memcg_lrus); + kvfree(nlru->memcg_lrus); } static int memcg_update_list_lru_node(struct list_lru_node *nlru, @@ -351,12 +351,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, BUG_ON(old_size > new_size); old = nlru->memcg_lrus; - new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); + new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; if (__memcg_init_list_lru_node(new, old_size, new_size)) { - kfree(new); + kvfree(new); return -ENOMEM; } @@ -373,7 +373,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, nlru->memcg_lrus = new; spin_unlock_irq(&nlru->lock); - kfree(old); + kvfree(old); return 0; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 904a83be82de..80164599ca5d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -165,9 +165,9 @@ static int init_memcg_params(struct kmem_cache *s, if (!memcg_nr_cache_ids) return 0; - arr = kzalloc(sizeof(struct memcg_cache_array) + - memcg_nr_cache_ids * sizeof(void *), - GFP_KERNEL); + arr = kvzalloc(sizeof(struct memcg_cache_array) + + memcg_nr_cache_ids * sizeof(void *), + GFP_KERNEL); if (!arr) return -ENOMEM; @@ -178,15 +178,23 @@ static int init_memcg_params(struct kmem_cache *s, static void destroy_memcg_params(struct kmem_cache *s) { if (is_root_cache(s)) - kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); + kvfree(rcu_access_pointer(s->memcg_params.memcg_caches)); +} + +static void free_memcg_params(struct rcu_head *rcu) +{ + struct memcg_cache_array *old; + + old = container_of(rcu, struct memcg_cache_array, rcu); + kvfree(old); } static int update_memcg_params(struct kmem_cache *s, int new_array_size) { struct memcg_cache_array *old, *new; - new = kzalloc(sizeof(struct memcg_cache_array) + - new_array_size * sizeof(void *), GFP_KERNEL); + new = kvzalloc(sizeof(struct memcg_cache_array) + + new_array_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; @@ -198,7 +206,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size) rcu_assign_pointer(s->memcg_params.memcg_caches, new); if (old) - kfree_rcu(old, rcu); + call_rcu(&old->rcu, free_memcg_params); return 0; } -- cgit From a70e43a59de9316e6fbad3b65557d0a24c099aca Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 3 Oct 2017 16:16:13 -0700 Subject: lib/idr.c: fix comment for idr_replace() idr_replace() returns the old value on success, not 0. Link: http://lkml.kernel.org/r/20170918162642.37511-1-ebiggers3@gmail.com Signed-off-by: Eric Biggers Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index f9adf4805fd7..edd9b2be1651 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -146,8 +146,8 @@ EXPORT_SYMBOL(idr_get_next_ext); * idr_alloc() and idr_remove() (as long as the ID being removed is not * the one being replaced!). * - * Returns: 0 on success. %-ENOENT indicates that @id was not found. - * %-EINVAL indicates that @id or @ptr were not valid. + * Returns: the old value on success. %-ENOENT indicates that @id was not + * found. %-EINVAL indicates that @id or @ptr were not valid. */ void *idr_replace(struct idr *idr, void *ptr, int id) { -- cgit From f64ac5e6e30668216cf489d73ba8a96e372d78c6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:16 -0700 Subject: mm, memory_hotplug: add scheduling point to __add_pages Patch series "mm, memory_hotplug: fix few soft lockups in memory hotadd". Johannes has noticed few soft lockups when adding a large nvdimm device. All of them were caused by a long loop without any explicit cond_resched which is a problem for !PREEMPT kernels. The fix is quite straightforward. Just make sure that cond_resched gets called from time to time. This patch (of 3): __add_pages gets a pfn range to add and there is no upper bound for a single call. This is usually a memory block aligned size for the regular memory hotplug - smaller sizes are usual for memory balloning drivers, or the whole NUMA node for physical memory online. There is no explicit scheduling point in that code path though. This can lead to long latencies while __add_pages is executed and we have even seen a soft lockup report during nvdimm initialization with !PREEMPT kernel NMI watchdog: BUG: soft lockup - CPU#11 stuck for 23s! [kworker/u641:3:832] [...] Workqueue: events_unbound async_run_entry_fn task: ffff881809270f40 ti: ffff881809274000 task.ti: ffff881809274000 RIP: _raw_spin_unlock_irqrestore+0x11/0x20 RSP: 0018:ffff881809277b10 EFLAGS: 00000286 [...] Call Trace: sparse_add_one_section+0x13d/0x18e __add_pages+0x10a/0x1d0 arch_add_memory+0x4a/0xc0 devm_memremap_pages+0x29d/0x430 pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 DWARF2 unwinder stuck at ret_from_fork+0x3f/0x70 Fix this by adding cond_resched once per each memory section in the given pfn range. Each section is constant amount of work which itself is not too expensive but many of them will just add up. Link: http://lkml.kernel.org/r/20170918121410.24466-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e882cb6da994..23d5bd968950 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -328,6 +328,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, if (err && (err != -EEXIST)) break; err = 0; + cond_resched(); } vmemmap_populate_print_last(); out: -- cgit From 9b6e63cbf85b89b2dbffa4955dbf2df8250e5375 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:19 -0700 Subject: mm, page_alloc: add scheduling point to memmap_init_zone memmap_init_zone gets a pfn range to initialize and it can be really large resulting in a soft lockup on non-preemptible kernels NMI watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [kworker/u642:5:1720] [...] task: ffff88ecd7e902c0 ti: ffff88eca4e50000 task.ti: ffff88eca4e50000 RIP: move_pfn_range_to_zone+0x185/0x1d0 [...] Call Trace: devm_memremap_pages+0x2c7/0x430 pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 Fix this by adding a scheduling point once per page block. Link: http://lkml.kernel.org/r/20170918121410.24466-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 38d165a87860..77e4d3c5c57b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5367,6 +5367,7 @@ not_early: __init_single_page(page, pfn, zone, nid); set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); } else { __init_single_pfn(pfn, zone, nid); } -- cgit From 1fdcce6e16c54facc4f0688630d3b9ecfcaa411f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:23 -0700 Subject: memremap: add scheduling point to devm_memremap_pages devm_memremap_pages is initializing struct pages in for_each_device_pfn and that can take quite some time. We have even seen a soft lockup triggering on a non preemptive kernel NMI watchdog: BUG: soft lockup - CPU#61 stuck for 22s! [kworker/u641:11:1808] [...] RIP: 0010:[] [] devm_memremap_pages+0x327/0x430 [...] Call Trace: pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 fix this by adding cond_resched every 1024 pages. Link: http://lkml.kernel.org/r/20170918121410.24466-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 6bcbfbf1a8fd..403ab9cdb949 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; - int error, nid, is_ram; + int error, nid, is_ram, i = 0; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, list_del(&page->lru); page->pgmap = pgmap; percpu_ref_get(ref); + if (!(++i % 1024)) + cond_resched(); } devres_add(dev, page_map); return __va(res->start); -- cgit From c9653850c90d6050197bc76ba672e00a7771aea5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 3 Oct 2017 16:16:26 -0700 Subject: kernel/kcmp.c: drop branch leftover typo The else branch been left over and escaped the source code refresh. Not a problem but better clean it up. Fixes: 0791e3644e5e ("kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files") Link: http://lkml.kernel.org/r/20170917165838.GA1887@uranus.lan Reported-by: Eugene Syromiatnikov Signed-off-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index ea34ed8bb952..055bb2962a0b 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1, if (filp_epoll) { filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); fput(filp_epoll); - } else + } if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); -- cgit From 1dd2bfc86818ddbc95f98e312e7704350223fd7d Mon Sep 17 00:00:00 2001 From: YASUAKI ISHIMATSU Date: Tue, 3 Oct 2017 16:16:29 -0700 Subject: mm/memory_hotplug: change pfn_to_section_nr/section_nr_to_pfn macro to inline function pfn_to_section_nr() and section_nr_to_pfn() are defined as macro. pfn_to_section_nr() has no issue even if it is defined as macro. But section_nr_to_pfn() has overflow issue if sec is defined as int. section_nr_to_pfn() just shifts sec by PFN_SECTION_SHIFT. If sec is defined as unsigned long, section_nr_to_pfn() returns pfn as 64 bit value. But if sec is defined as int, section_nr_to_pfn() returns pfn as 32 bit value. __remove_section() calculates start_pfn using section_nr_to_pfn() and scn_nr defined as int. So if hot-removed memory address is over 16TB, overflow issue occurs and section_nr_to_pfn() does not calculate correct pfn. To make callers use proper arg, the patch changes the macros to inline functions. Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory") Link: http://lkml.kernel.org/r/e643a387-e573-6bbf-d418-c60c8ee3d15e@gmail.com Signed-off-by: Yasuaki Ishimatsu Acked-by: Michal Hocko Cc: Xishi Qiu Cc: Reza Arbab Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 ++++++++-- mm/memory_hotplug.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 356a814e7c8e..c8f89417740b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1094,8 +1094,14 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif -#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) -#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) +static inline unsigned long pfn_to_section_nr(unsigned long pfn) +{ + return pfn >> PFN_SECTION_SHIFT; +} +static inline unsigned long section_nr_to_pfn(unsigned long sec) +{ + return sec << PFN_SECTION_SHIFT; +} #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 23d5bd968950..efd1ad37bb57 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -551,7 +551,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, return ret; scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn(scn_nr); + start_pfn = section_nr_to_pfn((unsigned long)scn_nr); __remove_zone(zone, start_pfn); sparse_remove_one_section(zone, ms, map_offset); -- cgit From d09b0137d204bebeaafed672bc5a244e9ac92edb Mon Sep 17 00:00:00 2001 From: YASUAKI ISHIMATSU Date: Tue, 3 Oct 2017 16:16:32 -0700 Subject: mm/memory_hotplug: define find_{smallest|biggest}_section_pfn as unsigned long find_{smallest|biggest}_section_pfn()s find the smallest/biggest section and return the pfn of the section. But the functions are defined as int. So the functions always return 0x00000000 - 0xffffffff. It means if memory address is over 16TB, the functions does not work correctly. To handle 64 bit value, the patch defines find_{smallest|biggest}_section_pfn() as unsigned long. Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory") Link: http://lkml.kernel.org/r/d9d5593a-d0a4-c4be-ab08-493df59a85c6@gmail.com Signed-off-by: Yasuaki Ishimatsu Acked-by: Michal Hocko Cc: Xishi Qiu Cc: Reza Arbab Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index efd1ad37bb57..d4b5f29906b9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(__add_pages); #ifdef CONFIG_MEMORY_HOTREMOVE /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ -static int find_smallest_section_pfn(int nid, struct zone *zone, +static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { @@ -363,7 +363,7 @@ static int find_smallest_section_pfn(int nid, struct zone *zone, } /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ -static int find_biggest_section_pfn(int nid, struct zone *zone, +static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { -- cgit From 90ceb2a3ad868f800eb1c9f4ede650daddd94b77 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:35 -0700 Subject: kernel/params.c: fix the maximum length in param_get_string The length parameter of strlcpy() is supposed to reflect the size of the target buffer, not of the source string. Harmless in this case as the buffer is PAGE_SIZE long and the source string is always much shorter than this, but conceptually wrong, so let's fix it. Link: http://lkml.kernel.org/r/20170928162515.24846b4f@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/params.c b/kernel/params.c index 1cd8f1a895a8..8283ba045f4f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, kps->maxlen); + return strlcpy(buffer, kps->string, PAGE_SIZE); } EXPORT_SYMBOL(param_get_string); -- cgit From 96802e6b1dbf29d3012b39503c5dd6d9d8e82955 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:38 -0700 Subject: kernel/params.c: fix an overflow in param_attr_show Function param_attr_show could overflow the buffer it is operating on. The buffer size is PAGE_SIZE, and the string returned by attribute->param->ops->get is generated by scnprintf(buffer, PAGE_SIZE, ...) so it could be PAGE_SIZE - 1 long, with the terminating '\0' at the very end of the buffer. Calling strcat(..., "\n") on this isn't safe, as the '\0' will be replaced by '\n' (OK) and then another '\0' will be added past the end of the buffer (not OK.) Simply add the trailing '\n' when writing the attribute contents to the buffer originally. This is safe, and also faster. Credits to Teradata for discovering this issue. Link: http://lkml.kernel.org/r/20170928162602.60c379c7@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index 8283ba045f4f..0cca488263dd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -224,7 +224,7 @@ char *parse_args(const char *doing, } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ - return scnprintf(buffer, PAGE_SIZE, format, \ + return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ @@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { - return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); + return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); @@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); + return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); @@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { + /* Replace \n with comma */ if (i) - buffer[off++] = ','; + buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); @@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, PAGE_SIZE); + return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); @@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr, kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); - if (count > 0) { - strcat(buf, "\n"); - ++count; - } return count; } -- cgit From e0596c80f442d1e1221c17dbb71b2aed43909221 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:41 -0700 Subject: kernel/params.c: improve STANDARD_PARAM_DEF readability Align the parameters passed to STANDARD_PARAM_DEF for clarity. Link: http://lkml.kernel.org/r/20170928162728.756143cc@endymion Signed-off-by: Jean Delvare Suggested-by: Ingo Molnar Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index 0cca488263dd..cc9108c2a1fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -236,14 +236,14 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { -- cgit From 656d61ce9666209c4c4a13c71902d3ee70d1ff6f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 3 Oct 2017 16:16:45 -0700 Subject: lib/ratelimit.c: use deferred printk() version printk_ratelimit() invokes ___ratelimit() which may invoke a normal printk() (pr_warn() in this particular case) to warn about suppressed output. Given that printk_ratelimit() may be called from anywhere, that pr_warn() is dangerous - it may end up deadlocking the system. Fix ___ratelimit() by using deferred printk(). Sasha reported the following lockdep error: : Unregister pv shared memory for cpu 8 : select_fallback_rq: 3 callbacks suppressed : process 8583 (trinity-c78) no longer affine to cpu8 : : ====================================================== : WARNING: possible circular locking dependency detected : 4.14.0-rc2-next-20170927+ #252 Not tainted : ------------------------------------------------------ : migration/8/62 is trying to acquire lock: : (&port_lock_key){-.-.}, at: serial8250_console_write() : : but task is already holding lock: : (&rq->lock){-.-.}, at: sched_cpu_dying() : : which lock already depends on the new lock. : : : the existing dependency chain (in reverse order) is: : : -> #3 (&rq->lock){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock() : task_fork_fair() : sched_fork() : copy_process.part.31() : _do_fork() : kernel_thread() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #2 (&p->pi_lock){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : try_to_wake_up() : default_wake_function() : woken_wake_function() : __wake_up_common() : __wake_up_common_lock() : __wake_up() : tty_wakeup() : tty_port_default_wakeup() : tty_port_tty_wakeup() : uart_write_wakeup() : serial8250_tx_chars() : serial8250_handle_irq.part.25() : serial8250_default_handle_irq() : serial8250_interrupt() : __handle_irq_event_percpu() : handle_irq_event_percpu() : handle_irq_event() : handle_level_irq() : handle_irq() : do_IRQ() : ret_from_intr() : native_safe_halt() : default_idle() : arch_cpu_idle() : default_idle_call() : do_idle() : cpu_startup_entry() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #1 (&tty->write_wait){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : __wake_up_common_lock() : __wake_up() : tty_wakeup() : tty_port_default_wakeup() : tty_port_tty_wakeup() : uart_write_wakeup() : serial8250_tx_chars() : serial8250_handle_irq.part.25() : serial8250_default_handle_irq() : serial8250_interrupt() : __handle_irq_event_percpu() : handle_irq_event_percpu() : handle_irq_event() : handle_level_irq() : handle_irq() : do_IRQ() : ret_from_intr() : native_safe_halt() : default_idle() : arch_cpu_idle() : default_idle_call() : do_idle() : cpu_startup_entry() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #0 (&port_lock_key){-.-.}: : check_prev_add() : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : serial8250_console_write() : univ8250_console_write() : console_unlock() : vprintk_emit() : vprintk_default() : vprintk_func() : printk() : ___ratelimit() : __printk_ratelimit() : select_fallback_rq() : sched_cpu_dying() : cpuhp_invoke_callback() : take_cpu_down() : multi_cpu_stop() : cpu_stopper_thread() : smpboot_thread_fn() : kthread() : ret_from_fork() : : other info that might help us debug this: : : Chain exists of: : &port_lock_key --> &p->pi_lock --> &rq->lock : : Possible unsafe locking scenario: : : CPU0 CPU1 : ---- ---- : lock(&rq->lock); : lock(&p->pi_lock); : lock(&rq->lock); : lock(&port_lock_key); : : *** DEADLOCK *** : : 4 locks held by migration/8/62: : #0: (&p->pi_lock){-.-.}, at: sched_cpu_dying() : #1: (&rq->lock){-.-.}, at: sched_cpu_dying() : #2: (printk_ratelimit_state.lock){....}, at: ___ratelimit() : #3: (console_lock){+.+.}, at: vprintk_emit() : : stack backtrace: : CPU: 8 PID: 62 Comm: migration/8 Not tainted 4.14.0-rc2-next-20170927+ #252 : Call Trace: : dump_stack() : print_circular_bug() : check_prev_add() : ? add_lock_to_list.isra.26() : ? check_usage() : ? kvm_clock_read() : ? kvm_sched_clock_read() : ? sched_clock() : ? check_preemption_disabled() : __lock_acquire() : ? __lock_acquire() : ? add_lock_to_list.isra.26() : ? debug_check_no_locks_freed() : ? memcpy() : lock_acquire() : ? serial8250_console_write() : _raw_spin_lock_irqsave() : ? serial8250_console_write() : serial8250_console_write() : ? serial8250_start_tx() : ? lock_acquire() : ? memcpy() : univ8250_console_write() : console_unlock() : ? __down_trylock_console_sem() : vprintk_emit() : vprintk_default() : vprintk_func() : printk() : ? show_regs_print_info() : ? lock_acquire() : ___ratelimit() : __printk_ratelimit() : select_fallback_rq() : sched_cpu_dying() : ? sched_cpu_starting() : ? rcutree_dying_cpu() : ? sched_cpu_starting() : cpuhp_invoke_callback() : ? cpu_disable_common() : take_cpu_down() : ? trace_hardirqs_off_caller() : ? cpuhp_invoke_callback() : multi_cpu_stop() : ? __this_cpu_preempt_check() : ? cpu_stop_queue_work() : cpu_stopper_thread() : ? cpu_stop_create() : smpboot_thread_fn() : ? sort_range() : ? schedule() : ? __kthread_parkme() : kthread() : ? sort_range() : ? kthread_create_on_node() : ret_from_fork() : process 9121 (trinity-c78) no longer affine to cpu8 : smpboot: CPU 8 is now offline Link: http://lkml.kernel.org/r/20170928120405.18273-1-sergey.senozhatsky@gmail.com Fixes: 6b1d174b0c27b ("ratelimit: extend to print suppressed messages on release") Signed-off-by: Sergey Senozhatsky Reported-by: Sasha Levin Reviewed-by: Petr Mladek Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ratelimit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 08f8043cac61..d01f47135239 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -48,7 +48,9 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (time_is_before_jiffies(rs->begin + rs->interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { - pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); + printk_deferred(KERN_WARNING + "%s: %d callbacks suppressed\n", + func, rs->missed); rs->missed = 0; } } -- cgit From d22e3d69ee1a3f83ff7cc943af63de48b6156dcf Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 3 Oct 2017 16:16:49 -0700 Subject: m32r: fix build failure The allmodconfig build of m32r is failing with the error: lib/mpi/mpih-div.o: In function 'mpihelp_divrem': mpih-div.c:(.text+0x40): undefined reference to 'abort' mpih-div.c:(.text+0x40): relocation truncated to fit: R_M32R_26_PCREL_RELA against undefined symbol 'abort' The function 'abort' was never defined for the m32r architecture. Create 'abort' as is done in other arch like 'arm' and 'unicore32'. Link: http://lkml.kernel.org/r/1506727220-6108-1-git-send-email-sudip.mukherjee@codethink.co.uk Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/kernel/traps.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index 647dd94a0c39..72b96f282689 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -114,6 +114,15 @@ static void set_eit_vector_entries(void) _flush_cache_copyback_all(); } +void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} +EXPORT_SYMBOL(abort); + void __init trap_init(void) { set_eit_vector_entries(); -- cgit From a08ffbef4ab799351d0610bbdbeaa1ee746b9065 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 3 Oct 2017 16:16:51 -0700 Subject: checkpatch: fix ignoring cover-letter logic Currently running checkpatch on a directory with a cover-letter.patch file reports the following error: ----------------------------------------- patches/smp-v2/v2-0000-cover-letter.patch ----------------------------------------- ERROR: Does not appear to be a unified-diff format patch The logic to suppress the unified-diff check for cover letters is there but is checking $file instead of $filename. Fix the variable to use the correct one. Link: http://lkml.kernel.org/r/20170909090406.31523-1-shorne@gmail.com Signed-off-by: Stafford Horne Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index dd2c262aebbf..8b80bac055e4 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6390,7 +6390,7 @@ sub process { exit(0); } - if (!$is_patch && $file !~ /cover-letter\.patch$/) { + if (!$is_patch && $filename !~ /cover-letter\.patch$/) { ERROR("NOT_UNIFIED_DIFF", "Does not appear to be a unified-diff format patch\n"); } -- cgit From 32e57c29e3c038ac802b7cc214a8795a4234055f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 3 Oct 2017 16:16:54 -0700 Subject: include/linux/fs.h: fix comment about struct address_space Before commit 9c5d760b8d22 ("mm: split gfp_mask and mapping flags into separate fields") the private_* fields of struct adrress_space were grouped together and using "ditto" in comments describing the last fields was correct. With introduction of gpf_mask between private_lock and private_list "ditto" references the wrong description. Fix it by using the elaborate description. Link: http://lkml.kernel.org/r/1507009987-8746-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 339e73742e73..13dab191a23e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -403,7 +403,7 @@ struct address_space { unsigned long flags; /* error bits */ spinlock_t private_lock; /* for use by the address_space */ gfp_t gfp_mask; /* implicit gfp mask for allocations */ - struct list_head private_list; /* ditto */ + struct list_head private_list; /* for use by the address_space */ void *private_data; /* ditto */ errseq_t wb_err; } __attribute__((aligned(sizeof(long)))) __randomize_layout; -- cgit From e4c77f8b9b213c6315faba109c03b0db873db200 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Sep 2017 17:47:50 +0200 Subject: ARM: defconfig: FRAMEBUFFER_CONSOLE can no longer be =m It is no longer possible to load this at runtime, so let's change the few remaining users to have it built-in all the time. arch/arm/configs/zeus_defconfig:115:warning: symbol value 'm' invalid for FRAMEBUFFER_CONSOLE arch/arm/configs/viper_defconfig:116:warning: symbol value 'm' invalid for FRAMEBUFFER_CONSOLE arch/arm/configs/pxa_defconfig:474:warning: symbol value 'm' invalid for FRAMEBUFFER_CONSOLE Reported-by: kernelci.org bot Fixes: 6104c37094e7 ("fbcon: Make fbcon a built-time depency for fbdev") Signed-off-by: Arnd Bergmann Signed-off-by: Olof Johansson --- arch/arm/configs/pxa_defconfig | 2 +- arch/arm/configs/viper_defconfig | 2 +- arch/arm/configs/zeus_defconfig | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index 64e3a2a8cede..d5e1370ec303 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -471,7 +471,7 @@ CONFIG_LCD_PLATFORM=m CONFIG_LCD_TOSA=m CONFIG_BACKLIGHT_PWM=m CONFIG_BACKLIGHT_TOSA=m -CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y CONFIG_LOGO=y CONFIG_SOUND=m diff --git a/arch/arm/configs/viper_defconfig b/arch/arm/configs/viper_defconfig index 44d4fa57ba0a..070e5074f1ee 100644 --- a/arch/arm/configs/viper_defconfig +++ b/arch/arm/configs/viper_defconfig @@ -113,7 +113,7 @@ CONFIG_FB_PXA_PARAMETERS=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BACKLIGHT_PWM=m # CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m diff --git a/arch/arm/configs/zeus_defconfig b/arch/arm/configs/zeus_defconfig index 8d4c0c926c34..09e7050d5653 100644 --- a/arch/arm/configs/zeus_defconfig +++ b/arch/arm/configs/zeus_defconfig @@ -112,7 +112,7 @@ CONFIG_FB_PXA=m CONFIG_FB_PXA_PARAMETERS=y CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -- cgit From 0694b2ee87ee1a6d83acf1a66b92c8e64ceb38f2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 17 Sep 2017 16:26:18 +0200 Subject: ARM: defconfig: update Gemini defconfig This updates the Gemini defconfig with drivers merged for v4.13 or v4.14: - ATA driver is merged - DMA driver is merged - RTC driver gets selected from default Kconfig Signed-off-by: Linus Walleij Signed-off-by: Olof Johansson --- arch/arm/configs/gemini_defconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/configs/gemini_defconfig b/arch/arm/configs/gemini_defconfig index d2d75fa664a6..2a63fa10c813 100644 --- a/arch/arm/configs/gemini_defconfig +++ b/arch/arm/configs/gemini_defconfig @@ -32,6 +32,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_LOWLEVEL is not set CONFIG_ATA=y +CONFIG_PATA_FTIDE010=y CONFIG_INPUT_EVDEV=y CONFIG_KEYBOARD_GPIO=y # CONFIG_INPUT_MOUSE is not set @@ -55,8 +56,8 @@ CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_GEMINI=y CONFIG_DMADEVICES=y +CONFIG_AMBA_PL08X=y # CONFIG_DNOTIFY is not set CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y -- cgit From 043d1e729b0fbaf2b69386fe45290b8a9a18a6a9 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Wed, 6 Sep 2017 21:21:08 +0300 Subject: ARC: [plat-axs103] Add temporary quirk to reset ethernet IP DW ethernet controller on AXS10x hangs sometimes after SW reset, so add temporary quirk to reset DW ethernet controller IP core. This quirk can be removed after axs10x reset driver (see http://patchwork.ozlabs.org/patch/800273/) or simple reset driver (see https://patchwork.kernel.org/patch/9903375/) will be available in upstream. Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/plat-axs10x/axs10x.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index f1ac6790da5f..cf14ebc36916 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -111,6 +111,13 @@ static void __init axs10x_early_init(void) axs10x_enable_gpio_intc_wire(); + /* + * Reset ethernet IP core. + * TODO: get rid of this quirk after axs10x reset driver (or simple + * reset driver) will be available in upstream. + */ + iowrite32((1 << 5), (void __iomem *) CREG_MB_SW_RESET); + scnprintf(mb, 32, "MainBoard v%d", mb_rev); axs10x_print_board_ver(CREG_MB_VER, mb); } -- cgit From 6afa3bcf1f919c374d4606a7ed8078d3f67dfa90 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Fri, 8 Sep 2017 21:42:33 +0300 Subject: ARC: [plat-hsdk] sdio: Temporary fix of sdio ciu frequency DW sdio controller has external ciu clock divider controlled via register in SDIO IP. Due to its unexpected default value (it should divide by 1 but it divides by 8) SDIO IP uses wrong ciu clock and works unstable So add temporary fix and change clock frequency from 100000000 to 12500000 Hz until we fix dw sdio driver itself. Fixes SNPS STAR 9001204800 Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 229d13adbce4..daeef4ab2df9 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -120,7 +120,17 @@ mmcclk_ciu: mmcclk-ciu { compatible = "fixed-clock"; - clock-frequency = <100000000>; + /* + * DW sdio controller has external ciu clock divider + * controlled via register in SDIO IP. Due to its + * unexpected default value (it should devide by 1 + * but it devides by 8) SDIO IP uses wrong clock and + * works unstable (see STAR 9001204800) + * So add temporary fix and change clock frequency + * from 100000000 to 12500000 Hz until we fix dw sdio + * driver itself. + */ + clock-frequency = <12500000>; #clock-cells = <0>; }; -- cgit From 976e78a5226598cb582fe9ef98a72861adbc0e9c Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Tue, 12 Sep 2017 21:20:45 +0300 Subject: ARC: [plat-axs10x] sdio: Temporary fix of sdio ciu frequency DW sdio controller has external ciu clock divider controlled via register in SDIO IP. It divides sdio_ref_clk (which comes from CGU) by 16 for default. So default mmcclk clock (which comes to sdk_in) is 25000000 Hz. So fix wrong current value (50000000 Hz) to actual 25000000 Hz. Note this is a preventive fix, in line with similar change for HSDK where this was actually needed. see: http://lists.infradead.org/pipermail/linux-snps-arc/2017-September/002924.html Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/axs10x_mb.dtsi | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index 2367a67c5f10..e114000a84f5 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -44,7 +44,14 @@ mmcclk: mmcclk { compatible = "fixed-clock"; - clock-frequency = <50000000>; + /* + * DW sdio controller has external ciu clock divider + * controlled via register in SDIO IP. It divides + * sdio_ref_clk (which comes from CGU) by 16 for + * default. So default mmcclk clock (which comes + * to sdk_in) is 25000000 Hz. + */ + clock-frequency = <25000000>; #clock-cells = <0>; }; -- cgit From 9583833e9e3628177661e815e5ce80dd3955d82f Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Fri, 8 Sep 2017 23:12:59 +0300 Subject: ARC: [*defconfig] Reenable soft lock-up detector Commit 92e5aae45778 "kernel/watchdog: split up config options" introduced SOFTLOCKUP_DETECTOR which selects LOCKUP_DETECTOR instead of the latter to be selected itself. We need to adjust our defconfigs accordingly. Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/configs/axs101_defconfig | 2 +- arch/arc/configs/axs103_defconfig | 2 +- arch/arc/configs/axs103_smp_defconfig | 2 +- arch/arc/configs/haps_hs_smp_defconfig | 2 +- arch/arc/configs/hsdk_defconfig | 2 +- arch/arc/configs/vdk_hs38_defconfig | 2 +- arch/arc/configs/vdk_hs38_smp_defconfig | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig index 6980b966a364..ec7c849a5c8e 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -105,7 +105,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig index 2233f5777a71..63d3cf69e0b0 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -104,7 +104,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig index 30a3d4cf53d2..f613ecac14a7 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -107,7 +107,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index 821a2e562f3f..3507be2af6fe 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -84,5 +84,5 @@ CONFIG_TMPFS=y CONFIG_NFS_FS=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index 9a3fcf446388..7b8f8faf8a24 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -72,7 +72,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig index c0d6a010751a..4fcf4f2503f6 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -94,7 +94,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_SHIRQ=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig index 5c0971787acf..7b71464f6c2f 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -98,7 +98,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_SHIRQ=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set -- cgit From ef833eab1ddec06982ea620086b03d67ef4ddf9b Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Mon, 4 Sep 2017 12:48:43 +0300 Subject: ARC: [plat-hsdk] use actual clk driver to manage cpu clk With corresponding clk driver now merged upstream, switch to it. - core_clk now represent the PLL (vs. fixed clk before) - input_clk represent the clk signal src for PLL (basically xtal) Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 11 +++++++++-- arch/arc/plat-hsdk/Kconfig | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index daeef4ab2df9..b922f3faf554 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -57,10 +57,10 @@ }; }; - core_clk: core-clk { + input_clk: input-clk { #clock-cells = <0>; compatible = "fixed-clock"; - clock-frequency = <500000000>; + clock-frequency = <33333333>; }; cpu_intc: cpu-interrupt-controller { @@ -102,6 +102,13 @@ ranges = <0x00000000 0xf0000000 0x10000000>; + core_clk: core-clk@0 { + compatible = "snps,hsdk-core-pll-clock"; + reg = <0x00 0x10>, <0x14B8 0x4>; + #clock-cells = <0>; + clocks = <&input_clk>; + }; + serial: serial@5000 { compatible = "snps,dw-apb-uart"; reg = <0x5000 0x100>; diff --git a/arch/arc/plat-hsdk/Kconfig b/arch/arc/plat-hsdk/Kconfig index 5a6ed5afb009..bd08de4be75e 100644 --- a/arch/arc/plat-hsdk/Kconfig +++ b/arch/arc/plat-hsdk/Kconfig @@ -6,4 +6,5 @@ # menuconfig ARC_SOC_HSDK - bool "ARC HS Development Kit SOC" + bool "ARC HS Development Kit SOC" + select CLK_HSDK -- cgit From bd6d3588c834e3087ad0229ff0da651bbccf5e24 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 11 Sep 2017 09:48:46 -0700 Subject: ARC: [plat-eznps] Update platform maintainer as Noam left Signed-off-by: Vineet Gupta --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 65b0c88d5ee0..42bfa57673ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5259,7 +5259,8 @@ S: Maintained F: drivers/iommu/exynos-iommu.c EZchip NPS platform support -M: Noam Camus +M: Elad Kanfi +M: Vineet Gupta S: Supported F: arch/arc/plat-eznps F: arch/arc/boot/dts/eznps.dts -- cgit From d9bc84a808572451f95fb1dde80cb8d12be05665 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Sep 2017 20:25:30 +0900 Subject: arc: remove redundant UTS_MACHINE define in arch/arc/Makefile The top-level Makefile sets the default of UTS_MACHINE to $(ARCH). If ARCH and UTS_MACHINE match, arch/$(ARCH)/Makefile need not specify UTS_MACHINE explicitly. Signed-off-by: Masahiro Yamada Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 3a4b52b7e09d..d37f49d6a27f 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -6,8 +6,6 @@ # published by the Free Software Foundation. # -UTS_MACHINE := arc - ifeq ($(CROSS_COMPILE),) ifndef CONFIG_CPU_BIG_ENDIAN CROSS_COMPILE := arc-linux- -- cgit From 010a8c98884f4ca42a167c9b51470c624daa2932 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 21 Sep 2017 17:46:38 -0700 Subject: ARC: boot log: decontaminate ARCv2 ISA_CONFIG register ARCv2 ISA_CONFIG and ARC700_BUILD build config registers are not compatible. cpuinfo_arc had isa info placeholder which was mashup of bits form both. Untangle this by defining it off of ARCv2 ISA info and it is fine even for ARC700 since former is a super set of latter (ARC700 buildonly has 2 bits for atomics and stack check). At runtime, we treat ARCv2 ISA info as a generic placeholder but populate it correctly depending on ARC700 or HS. This paves way for adding more HS specific bits in isa info which was colliding with the extra bits for arc700. Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 8 ++++---- arch/arc/kernel/setup.c | 15 +++++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index ba8e802dba80..b71d84873f7d 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -135,12 +135,12 @@ struct bcr_identity { #endif }; -struct bcr_isa { +struct bcr_isa_arcv2 { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int div_rem:4, pad2:4, ldd:1, unalign:1, atomic:1, be:1, - pad1:11, atomic1:1, ver:8; + pad1:12, ver:8; #else - unsigned int ver:8, atomic1:1, pad1:11, be:1, atomic:1, unalign:1, + unsigned int ver:8, pad1:12, be:1, atomic:1, unalign:1, ldd:1, pad2:4, div_rem:4; #endif }; @@ -263,7 +263,7 @@ struct cpuinfo_arc { struct cpuinfo_arc_mmu mmu; struct cpuinfo_arc_bpu bpu; struct bcr_identity core; - struct bcr_isa isa; + struct bcr_isa_arcv2 isa; const char *details, *name; unsigned int vec_base; struct cpuinfo_arc_ccm iccm, dccm; diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 877cec8f5ea2..228593a964f8 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -119,11 +119,11 @@ static void read_arc_build_cfg_regs(void) struct bcr_generic bcr; struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; const struct id_to_str *tbl; + struct bcr_isa_arcv2 isa; FIX_PTR(cpu); READ_BCR(AUX_IDENTITY, cpu->core); - READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa); for (tbl = &arc_cpu_rel[0]; tbl->id != 0; tbl++) { if (cpu->core.family == tbl->id) { @@ -205,18 +205,25 @@ static void read_arc_build_cfg_regs(void) cpu->extn.debug = cpu->extn.ap | cpu->extn.smart | cpu->extn.rtt; + READ_BCR(ARC_REG_ISA_CFG_BCR, isa); + /* some hacks for lack of feature BCR info in old ARC700 cores */ if (is_isa_arcompact()) { - if (!cpu->isa.ver) /* ISA BCR absent, use Kconfig info */ + if (!isa.ver) /* ISA BCR absent, use Kconfig info */ cpu->isa.atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC); - else - cpu->isa.atomic = cpu->isa.atomic1; + else { + /* ARC700_BUILD only has 2 bits of isa info */ + struct bcr_generic bcr = *(struct bcr_generic *)&isa; + cpu->isa.atomic = bcr.info & 1; + } cpu->isa.be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); /* there's no direct way to distinguish 750 vs. 770 */ if (unlikely(cpu->core.family < 0x34 || cpu->mmu.ver < 3)) cpu->name = "ARC750"; + } else { + cpu->isa = isa; } } -- cgit From dea8252059a3210340f255bf69d67225b9af552d Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 21 Sep 2017 18:02:44 -0700 Subject: ARCv2: boot log: identify HS48 cores (dual issue) Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 3 ++- arch/arc/kernel/setup.c | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index b71d84873f7d..b1c56d35f2a9 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -98,6 +98,7 @@ /* Auxiliary registers */ #define AUX_IDENTITY 4 +#define AUX_EXEC_CTRL 8 #define AUX_INTR_VEC_BASE 0x25 #define AUX_VOL 0x5e @@ -269,7 +270,7 @@ struct cpuinfo_arc { struct cpuinfo_arc_ccm iccm, dccm; struct { unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, swape:1, pad1:2, - fpu_sp:1, fpu_dp:1, pad2:6, + fpu_sp:1, fpu_dp:1, dual_iss_enb:1, dual_iss_exist:1, pad2:4, debug:1, ap:1, smart:1, rtt:1, pad3:4, timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4; } extn; diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 228593a964f8..fb83844daeea 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -51,6 +51,7 @@ static const struct id_to_str arc_cpu_rel[] = { { 0x51, "R2.0" }, { 0x52, "R2.1" }, { 0x53, "R3.0" }, + { 0x54, "R4.0" }, #endif { 0x00, NULL } }; @@ -62,6 +63,7 @@ static const struct id_to_str arc_cpu_nm[] = { #else { 0x40, "ARC EM" }, { 0x50, "ARC HS38" }, + { 0x54, "ARC HS48" }, #endif { 0x00, "Unknown" } }; @@ -133,7 +135,7 @@ static void read_arc_build_cfg_regs(void) } for (tbl = &arc_cpu_nm[0]; tbl->id != 0; tbl++) { - if ((cpu->core.family & 0xF0) == tbl->id) + if ((cpu->core.family & 0xF4) == tbl->id) break; } cpu->name = tbl->str; @@ -192,6 +194,14 @@ static void read_arc_build_cfg_regs(void) cpu->bpu.full = bpu.ft; cpu->bpu.num_cache = 256 << bpu.bce; cpu->bpu.num_pred = 2048 << bpu.pte; + + if (cpu->core.family >= 0x54) { + unsigned int exec_ctrl; + + READ_BCR(AUX_EXEC_CTRL, exec_ctrl); + cpu->extn.dual_iss_exist = 1; + cpu->extn.dual_iss_enb = exec_ctrl & 1; + } } READ_BCR(ARC_REG_AP_BCR, bcr); @@ -239,10 +249,11 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) "\nIDENTITY\t: ARCVER [%#02x] ARCNUM [%#02x] CHIPID [%#4x]\n", core->family, core->cpu_id, core->chip_id); - n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s\n", + n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s%s%s\n", cpu_id, cpu->name, cpu->details, is_isa_arcompact() ? "ARCompact" : "ARCv2", - IS_AVAIL1(cpu->isa.be, "[Big-Endian]")); + IS_AVAIL1(cpu->isa.be, "[Big-Endian]"), + IS_AVAIL3(cpu->extn.dual_iss_exist, cpu->extn.dual_iss_enb, " Dual-Issue")); n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s%s%s\nISA Extn\t: ", IS_AVAIL1(cpu->extn.timer0, "Timer0 "), -- cgit From 5464d03d92601ac2977ef605b0cbb33276567daf Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 29 Sep 2017 14:46:50 -0700 Subject: ARC: fix allnoconfig build warning Reported-by: Dmitrii Kolesnichenko Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index a598641eed98..c84e67fdea09 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -24,7 +24,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select HAVE_FUTEX_CMPXCHG + select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES -- cgit From edb40d74c08edfd049cbba15479dadd9aeb7d307 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Thu, 28 Sep 2017 17:33:29 +0300 Subject: ARC: [plat-hsdk]: Temporary fix to set CPU frequency to 1GHz Add temporary fix to HSDK platform code to setup CPU frequency to 1GHz on early boot. We can remove this fix when smart hsdk pll driver will be introduced, see discussion: https://www.mail-archive.com/linux-snps-arc@lists.infradead.org/msg02689.html Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/plat-hsdk/platform.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c index a2e7fd17e36d..744e62e58788 100644 --- a/arch/arc/plat-hsdk/platform.c +++ b/arch/arc/plat-hsdk/platform.c @@ -38,6 +38,42 @@ static void __init hsdk_init_per_cpu(unsigned int cpu) #define CREG_PAE (CREG_BASE + 0x180) #define CREG_PAE_UPDATE (CREG_BASE + 0x194) +#define CREG_CORE_IF_CLK_DIV (CREG_BASE + 0x4B8) +#define CREG_CORE_IF_CLK_DIV_2 0x1 +#define CGU_BASE ARC_PERIPHERAL_BASE +#define CGU_PLL_STATUS (ARC_PERIPHERAL_BASE + 0x4) +#define CGU_PLL_CTRL (ARC_PERIPHERAL_BASE + 0x0) +#define CGU_PLL_STATUS_LOCK BIT(0) +#define CGU_PLL_STATUS_ERR BIT(1) +#define CGU_PLL_CTRL_1GHZ 0x3A10 +#define HSDK_PLL_LOCK_TIMEOUT 500 + +#define HSDK_PLL_LOCKED() \ + !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK) + +#define HSDK_PLL_ERR() \ + !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR) + +static void __init hsdk_set_cpu_freq_1ghz(void) +{ + u32 timeout = HSDK_PLL_LOCK_TIMEOUT; + + /* + * As we set cpu clock which exceeds 500MHz, the divider for the interface + * clock must be programmed to div-by-2. + */ + iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV); + + /* Set cpu clock to 1GHz */ + iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL); + + while (!HSDK_PLL_LOCKED() && timeout--) + cpu_relax(); + + if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR()) + pr_err("Failed to setup CPU frequency to 1GHz!"); +} + static void __init hsdk_init_early(void) { /* @@ -52,6 +88,12 @@ static void __init hsdk_init_early(void) /* Really apply settings made above */ writel(1, (void __iomem *) CREG_PAE_UPDATE); + + /* + * Setup CPU frequency to 1GHz. + * TODO: remove it after smart hsdk pll driver will be introduced. + */ + hsdk_set_cpu_freq_1ghz(); } static const char *hsdk_compat[] __initconst = { -- cgit From 52bfcdd7adbc26639bc7b2356ab9a3f5dad68ad6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:18 -0700 Subject: xfs: always swap the cow forks when swapping extents Since the CoW fork exists as a secondary data structure to the data fork, we must always swap cow forks during swapext. We also need to swap the extent counts and reset the cowblocks tags. Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index bc6c6e10a969..e9db7fc95b70 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -2122,11 +2122,31 @@ xfs_swap_extents( ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + } + + /* Swap the cow forks. */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + xfs_extnum_t extnum; + + ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); + + extnum = ip->i_cnextents; + ip->i_cnextents = tip->i_cnextents; + tip->i_cnextents = extnum; + cowfp = ip->i_cowfp; ip->i_cowfp = tip->i_cowfp; tip->i_cowfp = cowfp; - xfs_inode_set_cowblocks_tag(ip); - xfs_inode_set_cowblocks_tag(tip); + + if (ip->i_cowfp && ip->i_cnextents) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); + if (tip->i_cowfp && tip->i_cnextents) + xfs_inode_set_cowblocks_tag(tip); + else + xfs_inode_clear_cowblocks_tag(tip); } xfs_trans_log_inode(tp, ip, src_log_flags); -- cgit From e12199f85d0ad1b04ce6c425ad93cd847fe930bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Oct 2017 08:58:33 -0700 Subject: xfs: handle racy AIO in xfs_reflink_end_cow If we got two AIO writes into a COW area the second one might not have any COW extents left to convert. Handle that case gracefully instead of triggering an assert or accessing beyond the bounds of the extent list. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 3246815c24d6..37e603bf1591 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -736,7 +736,13 @@ xfs_reflink_end_cow( /* If there is a hole at end_fsb - 1 go to the previous extent */ if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || got.br_startoff > end_fsb) { - ASSERT(idx > 0); + /* + * In case of racing, overlapping AIO writes no COW extents + * might be left by the time I/O completes for the loser of + * the race. In that case we are done. + */ + if (idx <= 0) + goto out_cancel; xfs_iext_get_extent(ifp, --idx, &got); } @@ -809,6 +815,7 @@ next_extent: out_defer: xfs_defer_cancel(&dfops); +out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: -- cgit From 57e7ba04d422c3d41c8426380303ec9b7533ded9 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 19 Sep 2017 09:39:08 -0700 Subject: lsm: fix smack_inode_removexattr and xattr_getsecurity memleak security_inode_getsecurity() provides the text string value of a security attribute. It does not provide a "secctx". The code in xattr_getsecurity() that calls security_inode_getsecurity() and then calls security_release_secctx() happened to work because SElinux and Smack treat the attribute and the secctx the same way. It fails for cap_inode_getsecurity(), because that module has no secctx that ever needs releasing. It turns out that Smack is the one that's doing things wrong by not allocating memory when instructed to do so by the "alloc" parameter. The fix is simple enough. Change the security_release_secctx() to kfree() because it isn't a secctx being returned by security_inode_getsecurity(). Change Smack to allocate the string when told to do so. Note: this also fixes memory leaks for LSMs which implement inode_getsecurity but not release_secctx, such as capabilities. Signed-off-by: Casey Schaufler Reported-by: Konstantin Khlebnikov Cc: stable@vger.kernel.org Signed-off-by: James Morris --- fs/xattr.c | 2 +- security/smack/smack_lsm.c | 55 +++++++++++++++++++++------------------------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 4424f7fecf14..61cd28ba25f3 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -250,7 +250,7 @@ xattr_getsecurity(struct inode *inode, const char *name, void *value, } memcpy(value, buffer, len); out: - security_release_secctx(buffer, len); + kfree(buffer); out_noalloc: return len; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 319add31b4a4..286171a16ed2 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1473,7 +1473,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name) * @inode: the object * @name: attribute name * @buffer: where to put the result - * @alloc: unused + * @alloc: duplicate memory * * Returns the size of the attribute or an error code */ @@ -1486,43 +1486,38 @@ static int smack_inode_getsecurity(struct inode *inode, struct super_block *sbp; struct inode *ip = (struct inode *)inode; struct smack_known *isp; - int ilen; - int rc = 0; - if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) { + if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) isp = smk_of_inode(inode); - ilen = strlen(isp->smk_known); - *buffer = isp->smk_known; - return ilen; - } + else { + /* + * The rest of the Smack xattrs are only on sockets. + */ + sbp = ip->i_sb; + if (sbp->s_magic != SOCKFS_MAGIC) + return -EOPNOTSUPP; - /* - * The rest of the Smack xattrs are only on sockets. - */ - sbp = ip->i_sb; - if (sbp->s_magic != SOCKFS_MAGIC) - return -EOPNOTSUPP; + sock = SOCKET_I(ip); + if (sock == NULL || sock->sk == NULL) + return -EOPNOTSUPP; - sock = SOCKET_I(ip); - if (sock == NULL || sock->sk == NULL) - return -EOPNOTSUPP; - - ssp = sock->sk->sk_security; + ssp = sock->sk->sk_security; - if (strcmp(name, XATTR_SMACK_IPIN) == 0) - isp = ssp->smk_in; - else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) - isp = ssp->smk_out; - else - return -EOPNOTSUPP; + if (strcmp(name, XATTR_SMACK_IPIN) == 0) + isp = ssp->smk_in; + else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) + isp = ssp->smk_out; + else + return -EOPNOTSUPP; + } - ilen = strlen(isp->smk_known); - if (rc == 0) { - *buffer = isp->smk_known; - rc = ilen; + if (alloc) { + *buffer = kstrdup(isp->smk_known, GFP_KERNEL); + if (*buffer == NULL) + return -ENOMEM; } - return rc; + return strlen(isp->smk_known); } -- cgit From de3ee99b097dd51938276e3af388cd4ad0f2750a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 20 Sep 2017 10:56:14 +0200 Subject: mmc: Delete bounce buffer handling In may, Steven sent a patch deleting the bounce buffer handling and the CONFIG_MMC_BLOCK_BOUNCE option. I chose the less invasive path of making it a runtime config option, and we merged that successfully for kernel v4.12. The code is however just standing in the way and taking up space for seemingly no gain on any systems in wide use today. Pierre says the code was there to improve speed on TI SDHCI controllers on certain HP laptops and possibly some Ricoh controllers as well. Early SDHCI controllers lacked the scatter-gather feature, which made software bounce buffers a significant speed boost. We are clearly talking about the list of SDHCI PCI-based MMC/SD card readers found in the pci_ids[] list in drivers/mmc/host/sdhci-pci-core.c. The TI SDHCI derivative is not supported by the upstream kernel. This leaves the Ricoh. What we can however notice is that the x86 defconfigs in the kernel did not enable CONFIG_MMC_BLOCK_BOUNCE option, which means that any such laptop would have to have a custom configured kernel to actually take advantage of this bounce buffer speed-up. It simply seems like there was a speed optimization for the Ricoh controllers that noone was using. (I have not checked the distro defconfigs but I am pretty sure the situation is the same there.) Bounce buffers increased performance on the OMAP HSMMC at one point, and was part of the original submission in commit a45c6cb81647 ("[ARM] 5369/1: omap mmc: Add new omap hsmmc controller for 2430 and 34xx, v3") This optimization was removed in commit 0ccd76d4c236 ("omap_hsmmc: Implement scatter-gather emulation") which found that scatter-gather emulation provided even better performance. The same was introduced for SDHCI in commit 2134a922c6e7 ("sdhci: scatter-gather (ADMA) support") I am pretty positively convinced that software scatter-gather emulation will do for any host controller what the bounce buffers were doing. Essentially, the bounce buffer was a reimplementation of software scatter-gather-emulation in the MMC subsystem, and it should be done away with. Cc: Pierre Ossman Cc: Juha Yrjola Cc: Steven J. Hill Cc: Shawn Lin Cc: Adrian Hunter Suggested-by: Steven J. Hill Suggested-by: Shawn Lin Signed-off-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 3 -- drivers/mmc/core/queue.c | 125 ++++------------------------------------------ drivers/mmc/core/queue.h | 6 --- drivers/mmc/host/cavium.c | 2 +- drivers/mmc/host/pxamci.c | 6 +-- include/linux/mmc/host.h | 2 +- 6 files changed, 12 insertions(+), 132 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 29fc1e662891..2ad7b5c69156 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1634,8 +1634,6 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, } mqrq->areq.mrq = &brq->mrq; - - mmc_queue_bounce_pre(mqrq); } static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, @@ -1829,7 +1827,6 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req) brq = &mq_rq->brq; old_req = mmc_queue_req_to_req(mq_rq); type = rq_data_dir(old_req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE; - mmc_queue_bounce_post(mq_rq); switch (status) { case MMC_BLK_SUCCESS: diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 74c663b1c0a7..0a4e77a5ba33 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -23,8 +23,6 @@ #include "core.h" #include "card.h" -#define MMC_QUEUE_BOUNCESZ 65536 - /* * Prepare a MMC request. This just filters out odd stuff. */ @@ -150,26 +148,6 @@ static void mmc_queue_setup_discard(struct request_queue *q, queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q); } -static unsigned int mmc_queue_calc_bouncesz(struct mmc_host *host) -{ - unsigned int bouncesz = MMC_QUEUE_BOUNCESZ; - - if (host->max_segs != 1 || (host->caps & MMC_CAP_NO_BOUNCE_BUFF)) - return 0; - - if (bouncesz > host->max_req_size) - bouncesz = host->max_req_size; - if (bouncesz > host->max_seg_size) - bouncesz = host->max_seg_size; - if (bouncesz > host->max_blk_count * 512) - bouncesz = host->max_blk_count * 512; - - if (bouncesz <= 512) - return 0; - - return bouncesz; -} - /** * mmc_init_request() - initialize the MMC-specific per-request data * @q: the request queue @@ -184,26 +162,9 @@ static int mmc_init_request(struct request_queue *q, struct request *req, struct mmc_card *card = mq->card; struct mmc_host *host = card->host; - if (card->bouncesz) { - mq_rq->bounce_buf = kmalloc(card->bouncesz, gfp); - if (!mq_rq->bounce_buf) - return -ENOMEM; - if (card->bouncesz > 512) { - mq_rq->sg = mmc_alloc_sg(1, gfp); - if (!mq_rq->sg) - return -ENOMEM; - mq_rq->bounce_sg = mmc_alloc_sg(card->bouncesz / 512, - gfp); - if (!mq_rq->bounce_sg) - return -ENOMEM; - } - } else { - mq_rq->bounce_buf = NULL; - mq_rq->bounce_sg = NULL; - mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp); - if (!mq_rq->sg) - return -ENOMEM; - } + mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp); + if (!mq_rq->sg) + return -ENOMEM; return 0; } @@ -212,13 +173,6 @@ static void mmc_exit_request(struct request_queue *q, struct request *req) { struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req); - /* It is OK to kfree(NULL) so this will be smooth */ - kfree(mq_rq->bounce_sg); - mq_rq->bounce_sg = NULL; - - kfree(mq_rq->bounce_buf); - mq_rq->bounce_buf = NULL; - kfree(mq_rq->sg); mq_rq->sg = NULL; } @@ -242,12 +196,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT; - /* - * mmc_init_request() depends on card->bouncesz so it must be calculated - * before blk_init_allocated_queue() starts allocating requests. - */ - card->bouncesz = mmc_queue_calc_bouncesz(host); - mq->card = card; mq->queue = blk_alloc_queue(GFP_KERNEL); if (!mq->queue) @@ -271,17 +219,11 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_can_erase(card)) mmc_queue_setup_discard(mq->queue, card); - if (card->bouncesz) { - blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512); - blk_queue_max_segments(mq->queue, card->bouncesz / 512); - blk_queue_max_segment_size(mq->queue, card->bouncesz); - } else { - blk_queue_bounce_limit(mq->queue, limit); - blk_queue_max_hw_sectors(mq->queue, - min(host->max_blk_count, host->max_req_size / 512)); - blk_queue_max_segments(mq->queue, host->max_segs); - blk_queue_max_segment_size(mq->queue, host->max_seg_size); - } + blk_queue_bounce_limit(mq->queue, limit); + blk_queue_max_hw_sectors(mq->queue, + min(host->max_blk_count, host->max_req_size / 512)); + blk_queue_max_segments(mq->queue, host->max_segs); + blk_queue_max_segment_size(mq->queue, host->max_seg_size); sema_init(&mq->thread_sem, 1); @@ -370,56 +312,7 @@ void mmc_queue_resume(struct mmc_queue *mq) */ unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq) { - unsigned int sg_len; - size_t buflen; - struct scatterlist *sg; struct request *req = mmc_queue_req_to_req(mqrq); - int i; - - if (!mqrq->bounce_buf) - return blk_rq_map_sg(mq->queue, req, mqrq->sg); - - sg_len = blk_rq_map_sg(mq->queue, req, mqrq->bounce_sg); - - mqrq->bounce_sg_len = sg_len; - - buflen = 0; - for_each_sg(mqrq->bounce_sg, sg, sg_len, i) - buflen += sg->length; - - sg_init_one(mqrq->sg, mqrq->bounce_buf, buflen); - - return 1; -} - -/* - * If writing, bounce the data to the buffer before the request - * is sent to the host driver - */ -void mmc_queue_bounce_pre(struct mmc_queue_req *mqrq) -{ - if (!mqrq->bounce_buf) - return; - - if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != WRITE) - return; - - sg_copy_to_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len, - mqrq->bounce_buf, mqrq->sg[0].length); -} - -/* - * If reading, bounce the data from the buffer after the request - * has been handled by the host driver - */ -void mmc_queue_bounce_post(struct mmc_queue_req *mqrq) -{ - if (!mqrq->bounce_buf) - return; - - if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != READ) - return; - sg_copy_from_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len, - mqrq->bounce_buf, mqrq->sg[0].length); + return blk_rq_map_sg(mq->queue, req, mqrq->sg); } diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 04fc89360a7a..f18d3f656baa 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -49,9 +49,6 @@ enum mmc_drv_op { struct mmc_queue_req { struct mmc_blk_request brq; struct scatterlist *sg; - char *bounce_buf; - struct scatterlist *bounce_sg; - unsigned int bounce_sg_len; struct mmc_async_req areq; enum mmc_drv_op drv_op; int drv_op_result; @@ -81,11 +78,8 @@ extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); - extern unsigned int mmc_queue_map_sg(struct mmc_queue *, struct mmc_queue_req *); -extern void mmc_queue_bounce_pre(struct mmc_queue_req *); -extern void mmc_queue_bounce_post(struct mmc_queue_req *); extern int mmc_access_rpmb(struct mmc_queue *); diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c index 27fb625cbcf3..fbd29f00fca0 100644 --- a/drivers/mmc/host/cavium.c +++ b/drivers/mmc/host/cavium.c @@ -1038,7 +1038,7 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host) */ mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED | MMC_CAP_ERASE | MMC_CAP_CMD23 | MMC_CAP_POWER_OFF_CARD | - MMC_CAP_3_3V_DDR | MMC_CAP_NO_BOUNCE_BUFF; + MMC_CAP_3_3V_DDR; if (host->use_sg) mmc->max_segs = 16; diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index 59ab194cb009..c763b404510f 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -702,11 +702,7 @@ static int pxamci_probe(struct platform_device *pdev) pxamci_init_ocr(host); - /* - * This architecture used to disable bounce buffers through its - * defconfig, now it is done at runtime as a host property. - */ - mmc->caps = MMC_CAP_NO_BOUNCE_BUFF; + mmc->caps = 0; host->cmdat = 0; if (!cpu_is_pxa25x()) { mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index f3f2d07feb2a..9a43763a68ad 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -316,7 +316,7 @@ struct mmc_host { #define MMC_CAP_UHS_SDR50 (1 << 18) /* Host supports UHS SDR50 mode */ #define MMC_CAP_UHS_SDR104 (1 << 19) /* Host supports UHS SDR104 mode */ #define MMC_CAP_UHS_DDR50 (1 << 20) /* Host supports UHS DDR50 mode */ -#define MMC_CAP_NO_BOUNCE_BUFF (1 << 21) /* Disable bounce buffers on host */ +/* (1 << 21) is free for reuse */ #define MMC_CAP_DRIVER_TYPE_A (1 << 23) /* Host supports Driver Type A */ #define MMC_CAP_DRIVER_TYPE_C (1 << 24) /* Host supports Driver Type C */ #define MMC_CAP_DRIVER_TYPE_D (1 << 25) /* Host supports Driver Type D */ -- cgit From ca3dcd3ff5b13a31a09a0119dc484b97ec19c4c8 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 2 Oct 2017 14:27:41 +0200 Subject: mmc: meson-gx: make sure the clock is rounded down Using CLK_DIVIDER_ROUND_CLOSEST is unsafe as the mmc clock could be rounded to a rate higher the specified rate. Removing this flag ensure that, if the rate needs to be rounded, it will be rounded down. Fixes: 51c5d8447bd7 ("MMC: meson: initial support for GX platforms") Signed-off-by: Jerome Brunet Reviewed-by: Kevin Hilman Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index c885c2d4b904..421c8719c202 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -531,8 +531,7 @@ static int meson_mmc_clk_init(struct meson_host *host) div->shift = __ffs(CLK_DIV_MASK); div->width = __builtin_popcountl(CLK_DIV_MASK); div->hw.init = &init; - div->flags = (CLK_DIVIDER_ONE_BASED | - CLK_DIVIDER_ROUND_CLOSEST); + div->flags = CLK_DIVIDER_ONE_BASED; clk = devm_clk_register(host->dev, &div->hw); if (WARN_ON(IS_ERR(clk))) -- cgit From 3e2b0af411d4bf85bc0fbc385756fd5968adb9fd Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 2 Oct 2017 14:27:42 +0200 Subject: mmc: meson-gx: fix rx phase reset Resetting the phase when POWER_ON is set the set_ios() call means that the phase is reset almost every time the set_ios() is called, while the expected behavior was to reset the phase on a power cycle. This had gone unnoticed until now because in all mode (except hs400) the tuning is done after the last to set_ios(). In such case, the tuning result is used anyway. In HS400, there are a few calls to set_ios() after the tuning is done, overwriting the tuning result. Resetting the phase on POWER_UP instead of POWER_ON solve the problem. Fixes: d341ca88eead ("mmc: meson-gx: rework tuning function") Signed-off-by: Jerome Brunet Reviewed-by: Kevin Hilman Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 421c8719c202..08a55c2e96e1 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -745,6 +745,10 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) case MMC_POWER_UP: if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd); + + /* Reset rx phase */ + clk_set_phase(host->rx_clk, 0); + break; case MMC_POWER_ON: @@ -758,8 +762,6 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) host->vqmmc_enabled = true; } - /* Reset rx phase */ - clk_set_phase(host->rx_clk, 0); break; } -- cgit From 0a44697627d17a66d7dc98f17aeca07ca79c5c20 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 2 Oct 2017 14:27:43 +0200 Subject: mmc: meson-gx: include tx phase in the tuning process It has been reported that some platforms (odroid-c2) may require a different tx phase setting to operate at high speed (hs200 and hs400) To improve the situation, this patch includes tx phase in the tuning process. Fixes: d341ca88eead ("mmc: meson-gx: rework tuning function") Reported-by: Heiner Kallweit Signed-off-by: Jerome Brunet Reviewed-by: Kevin Hilman Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 08a55c2e96e1..85745ef179e2 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -716,6 +716,22 @@ static int meson_mmc_clk_phase_tuning(struct mmc_host *mmc, u32 opcode, static int meson_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) { struct meson_host *host = mmc_priv(mmc); + int ret; + + /* + * If this is the initial tuning, try to get a sane Rx starting + * phase before doing the actual tuning. + */ + if (!mmc->doing_retune) { + ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk); + + if (ret) + return ret; + } + + ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->tx_clk); + if (ret) + return ret; return meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk); } @@ -746,8 +762,9 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd); - /* Reset rx phase */ + /* Reset phases */ clk_set_phase(host->rx_clk, 0); + clk_set_phase(host->tx_clk, 270); break; -- cgit From bb16ea1742c8f35a9349b7508dc45d3a922db5f5 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Mon, 2 Oct 2017 16:58:52 +0200 Subject: mmc: sdhci-xenon: Fix clock resource by adding an optional bus clock On Armada 7K/8K we need to explicitly enable the bus clock. The bus clock is optional because not all the SoCs need them but at least for Armada 7K/8K it is actually mandatory. The binding documentation is updating accordingly. Without this patch the kernel hand during boot if the mvpp2.2 network driver was not present in the kernel. Indeed the clock needed by the xenon controller was set by the network driver. Fixes: 3a3748dba881 ("mmc: sdhci-xenon: Add Marvell Xenon SDHC core functionality)" CC: Stable Tested-by: Zhoujie Wu Signed-off-by: Gregory CLEMENT Signed-off-by: Ulf Hansson --- .../bindings/mmc/marvell,xenon-sdhci.txt | 12 ++++++----- drivers/mmc/host/sdhci-xenon.c | 24 ++++++++++++++++++---- drivers/mmc/host/sdhci-xenon.h | 1 + 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt index b878a1e305af..ed1456f5c94d 100644 --- a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt +++ b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt @@ -16,11 +16,13 @@ Required Properties: - clocks: Array of clocks required for SDHC. - Require at least input clock for Xenon IP core. + Require at least input clock for Xenon IP core. For Armada AP806 and + CP110, the AXI clock is also mandatory. - clock-names: Array of names corresponding to clocks property. The input clock for Xenon IP core should be named as "core". + The input clock for the AXI bus must be named as "axi". - reg: * For "marvell,armada-3700-sdhci", two register areas. @@ -106,8 +108,8 @@ Example: compatible = "marvell,armada-ap806-sdhci"; reg = <0xaa0000 0x1000>; interrupts = - clocks = <&emmc_clk>; - clock-names = "core"; + clocks = <&emmc_clk>,<&axi_clk>; + clock-names = "core", "axi"; bus-width = <4>; marvell,xenon-phy-slow-mode; marvell,xenon-tun-count = <11>; @@ -126,8 +128,8 @@ Example: interrupts = vqmmc-supply = <&sd_vqmmc_regulator>; vmmc-supply = <&sd_vmmc_regulator>; - clocks = <&sdclk>; - clock-names = "core"; + clocks = <&sdclk>, <&axi_clk>; + clock-names = "core", "axi"; bus-width = <4>; marvell,xenon-tun-count = <9>; }; diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index 2eec2e652c53..0842bbc2d7ad 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -466,6 +466,7 @@ static int xenon_probe(struct platform_device *pdev) { struct sdhci_pltfm_host *pltfm_host; struct sdhci_host *host; + struct xenon_priv *priv; int err; host = sdhci_pltfm_init(pdev, &sdhci_xenon_pdata, @@ -474,6 +475,7 @@ static int xenon_probe(struct platform_device *pdev) return PTR_ERR(host); pltfm_host = sdhci_priv(host); + priv = sdhci_pltfm_priv(pltfm_host); /* * Link Xenon specific mmc_host_ops function, @@ -491,9 +493,20 @@ static int xenon_probe(struct platform_device *pdev) if (err) goto free_pltfm; + priv->axi_clk = devm_clk_get(&pdev->dev, "axi"); + if (IS_ERR(priv->axi_clk)) { + err = PTR_ERR(priv->axi_clk); + if (err == -EPROBE_DEFER) + goto err_clk; + } else { + err = clk_prepare_enable(priv->axi_clk); + if (err) + goto err_clk; + } + err = mmc_of_parse(host->mmc); if (err) - goto err_clk; + goto err_clk_axi; sdhci_get_of_property(pdev); @@ -502,11 +515,11 @@ static int xenon_probe(struct platform_device *pdev) /* Xenon specific dt parse */ err = xenon_probe_dt(pdev); if (err) - goto err_clk; + goto err_clk_axi; err = xenon_sdhc_prepare(host); if (err) - goto err_clk; + goto err_clk_axi; pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); @@ -527,6 +540,8 @@ remove_sdhc: pm_runtime_disable(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); xenon_sdhc_unprepare(host); +err_clk_axi: + clk_disable_unprepare(priv->axi_clk); err_clk: clk_disable_unprepare(pltfm_host->clk); free_pltfm: @@ -538,6 +553,7 @@ static int xenon_remove(struct platform_device *pdev) { struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct xenon_priv *priv = sdhci_pltfm_priv(pltfm_host); pm_runtime_get_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); @@ -546,7 +562,7 @@ static int xenon_remove(struct platform_device *pdev) sdhci_remove_host(host, 0); xenon_sdhc_unprepare(host); - + clk_disable_unprepare(priv->axi_clk); clk_disable_unprepare(pltfm_host->clk); sdhci_pltfm_free(pdev); diff --git a/drivers/mmc/host/sdhci-xenon.h b/drivers/mmc/host/sdhci-xenon.h index 2bc0510c0769..9994995c7c56 100644 --- a/drivers/mmc/host/sdhci-xenon.h +++ b/drivers/mmc/host/sdhci-xenon.h @@ -83,6 +83,7 @@ struct xenon_priv { unsigned char bus_width; unsigned char timing; unsigned int clock; + struct clk *axi_clk; int phy_type; /* -- cgit From 6b9dc4806b28214a4a260517e59439e0ac12a15e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Oct 2017 12:34:50 +0200 Subject: watchdog/core, powerpc: Replace watchdog_nmi_reconfigure() The recent cleanup of the watchdog code split watchdog_nmi_reconfigure() into two stages. One to stop the NMI and one to restart it after reconfiguration. That was done by adding a boolean 'run' argument to the code, which is functionally correct but not necessarily a piece of art. Replace it by two explicit functions: watchdog_nmi_stop() and watchdog_nmi_start(). Fixes: 6592ad2fcc8f ("watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage") Requested-by: Linus 'Nursing his pet-peeve' Torvalds Signed-off-by: Thomas 'Mopping up garbage' Gleixner Acked-by: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710021957480.2114@nanos --- arch/powerpc/kernel/watchdog.c | 23 ++++++++++++++--------- include/linux/nmi.h | 3 ++- kernel/watchdog.c | 33 ++++++++++++++++++--------------- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index dfb067764480..2673ec8bec00 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -355,19 +355,24 @@ static void watchdog_calc_timeouts(void) wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; } -void watchdog_nmi_reconfigure(bool run) +void watchdog_nmi_stop(void) { int cpu; cpus_read_lock(); - if (!run) { - for_each_cpu(cpu, &wd_cpus_enabled) - stop_wd_on_cpu(cpu); - } else { - watchdog_calc_timeouts(); - for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) - start_wd_on_cpu(cpu); - } + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + cpus_read_unlock(); +} + +void watchdog_nmi_start(void) +{ + int cpu; + + cpus_read_lock(); + watchdog_calc_timeouts(); + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); cpus_read_unlock(); } diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 89ba8b23c6fe..0c9ed49fb21a 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -109,7 +109,8 @@ static inline int hardlockup_detector_perf_init(void) { return 0; } # endif #endif -void watchdog_nmi_reconfigure(bool run); +void watchdog_nmi_stop(void); +void watchdog_nmi_start(void); /** * touch_nmi_watchdog - restart NMI watchdog timeout. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f6ef163b72cd..6ad6226535d0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -123,24 +123,27 @@ int __weak __init watchdog_nmi_probe(void) } /** - * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs - * @run: If false stop the watchdogs on all enabled CPUs - * If true start the watchdogs on all enabled CPUs + * watchdog_nmi_stop - Stop the watchdog for reconfiguration * - * The core call order is: - * watchdog_nmi_reconfigure(false); + * The reconfiguration steps are: + * watchdog_nmi_stop(); * update_variables(); - * watchdog_nmi_reconfigure(true); + * watchdog_nmi_start(); + */ +void __weak watchdog_nmi_stop(void) { } + +/** + * watchdog_nmi_start - Start the watchdog after reconfiguration * - * The second call which starts the watchdogs again guarantees that the - * following variables are stable across the call. + * Counterpart to watchdog_nmi_stop(). + * + * The following variables have been updated in update_variables() and + * contain the currently valid configuration: * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - * After the call the variables can be changed again. */ -void __weak watchdog_nmi_reconfigure(bool run) { } +void __weak watchdog_nmi_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit @@ -551,13 +554,13 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(void) { - watchdog_nmi_reconfigure(false); + watchdog_nmi_stop(); softlockup_park_all_threads(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); - watchdog_nmi_reconfigure(true); + watchdog_nmi_start(); } /* @@ -602,9 +605,9 @@ static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { - watchdog_nmi_reconfigure(false); + watchdog_nmi_stop(); lockup_detector_update_enable(); - watchdog_nmi_reconfigure(true); + watchdog_nmi_start(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -- cgit From e31d6883f21c1cdfe5bc64e28411f8a92b783fde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Oct 2017 16:37:53 +0200 Subject: watchdog/core, powerpc: Lock cpus across reconfiguration Instead of dropping the cpu hotplug lock after stopping NMI watchdog and threads and reaquiring for restart, the code and the protection rules become more obvious when holding cpu hotplug lock across the full reconfiguration. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710022105570.2114@nanos --- arch/powerpc/kernel/watchdog.c | 4 ---- kernel/smpboot.c | 3 +-- kernel/watchdog.c | 10 +++++++++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2673ec8bec00..f9b4c6352d24 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -359,21 +359,17 @@ void watchdog_nmi_stop(void) { int cpu; - cpus_read_lock(); for_each_cpu(cpu, &wd_cpus_enabled) stop_wd_on_cpu(cpu); - cpus_read_unlock(); } void watchdog_nmi_start(void) { int cpu; - cpus_read_lock(); watchdog_calc_timeouts(); for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) start_wd_on_cpu(cpu); - cpus_read_unlock(); } /* diff --git a/kernel/smpboot.c b/kernel/smpboot.c index ed7507b69b48..5043e7433f4b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -351,7 +351,7 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread static struct cpumask tmp; unsigned int cpu; - get_online_cpus(); + lockdep_assert_cpus_held(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ @@ -367,7 +367,6 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); } static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6ad6226535d0..fff90fe10007 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -535,7 +535,6 @@ static void softlockup_update_smpboot_threads(void) smpboot_update_cpumask_percpu_thread(&watchdog_threads, &watchdog_allowed_mask); - __lockup_detector_cleanup(); } /* Temporarily park all watchdog threads */ @@ -554,6 +553,7 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(void) { + cpus_read_lock(); watchdog_nmi_stop(); softlockup_park_all_threads(); set_sample_period(); @@ -561,6 +561,12 @@ static void softlockup_reconfigure_threads(void) if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); watchdog_nmi_start(); + cpus_read_unlock(); + /* + * Must be called outside the cpus locked section to prevent + * recursive locking in the perf code. + */ + __lockup_detector_cleanup(); } /* @@ -605,9 +611,11 @@ static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { + cpus_read_lock(); watchdog_nmi_stop(); lockup_detector_update_enable(); watchdog_nmi_start(); + cpus_read_unlock(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -- cgit From 34ddaa3e5c0096fef52485186c7eb6cf56ddc686 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Oct 2017 16:39:02 +0200 Subject: powerpc/watchdog: Make use of watchdog_nmi_probe() The rework of the core hotplug code triggers the WARN_ON in start_wd_cpu() on powerpc because it is called multiple times for the boot CPU. The first call is via: start_wd_on_cpu+0x80/0x2f0 watchdog_nmi_reconfigure+0x124/0x170 softlockup_reconfigure_threads+0x110/0x130 lockup_detector_init+0xbc/0xe0 kernel_init_freeable+0x18c/0x37c kernel_init+0x2c/0x160 ret_from_kernel_thread+0x5c/0xbc And then again via the CPU hotplug registration: start_wd_on_cpu+0x80/0x2f0 cpuhp_invoke_callback+0x194/0x620 cpuhp_thread_fun+0x7c/0x1b0 smpboot_thread_fn+0x290/0x2a0 kthread+0x168/0x1b0 ret_from_kernel_thread+0x5c/0xbc This can be avoided by setting up the cpu hotplug state with nocalls and move the initialization to the watchdog_nmi_probe() function. That initializes the hotplug callbacks without invoking the callback and the following core initialization function then configures the watchdog for the online CPUs (in this case CPU0) via softlockup_reconfigure_threads(). Reported-and-tested-by: Michael Ellerman Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org --- arch/powerpc/kernel/watchdog.c | 17 ++++++++--------- include/linux/nmi.h | 1 + kernel/watchdog.c | 5 ++++- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index f9b4c6352d24..c702a8981452 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -373,22 +373,21 @@ void watchdog_nmi_start(void) } /* - * This runs after lockup_detector_init() which sets up watchdog_cpumask. + * Invoked from core watchdog init. */ -static int __init powerpc_watchdog_init(void) +int __init watchdog_nmi_probe(void) { int err; - watchdog_calc_timeouts(); - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", - start_wd_on_cpu, stop_wd_on_cpu); - if (err < 0) + err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) { pr_warn("Watchdog could not be initialized"); - + return err; + } return 0; } -arch_initcall(powerpc_watchdog_init); static void handle_backtrace_ipi(struct pt_regs *regs) { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 0c9ed49fb21a..27e249ed7c5c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -111,6 +111,7 @@ static inline int hardlockup_detector_perf_init(void) { return 0; } void watchdog_nmi_stop(void); void watchdog_nmi_start(void); +int watchdog_nmi_probe(void); /** * touch_nmi_watchdog - restart NMI watchdog timeout. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fff90fe10007..5c6fb7cd9ae8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -608,7 +608,6 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { cpus_read_lock(); @@ -617,6 +616,10 @@ static void softlockup_reconfigure_threads(void) watchdog_nmi_start(); cpus_read_unlock(); } +static inline void softlockup_init_threads(void) +{ + softlockup_reconfigure_threads(); +} #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) -- cgit From 5587185ddb4b9f413299dfec0a022ad0212513e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Oct 2017 10:03:04 +0200 Subject: watchdog/core: Rename some softlockup_* functions The function names made sense up to the point where the watchdog (re)configuration was unified to use softlockup_reconfigure_threads() for all configuration purposes. But that includes scenarios which solely configure the nmi watchdog. Rename softlockup_reconfigure_threads() and softlockup_init_threads() so the function names match the functionality. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus --- kernel/watchdog.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5c6fb7cd9ae8..d241bd99cee1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -551,7 +551,7 @@ static void softlockup_unpark_threads(void) softlockup_update_smpboot_threads(); } -static void softlockup_reconfigure_threads(void) +static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -570,13 +570,13 @@ static void softlockup_reconfigure_threads(void) } /* - * Create the watchdog thread infrastructure. + * Create the watchdog thread infrastructure and configure the detector(s). * * The threads are not unparked as watchdog_allowed_mask is empty. When * the threads are sucessfully initialized, take the proper locks and * unpark the threads in the watchdog_cpumask if the watchdog is enabled. */ -static __init void softlockup_init_threads(void) +static __init void lockup_detector_setup(void) { int ret; @@ -599,7 +599,7 @@ static __init void softlockup_init_threads(void) mutex_lock(&watchdog_mutex); softlockup_threads_initialized = true; - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); mutex_unlock(&watchdog_mutex); } @@ -608,7 +608,7 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static void softlockup_reconfigure_threads(void) +static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -616,9 +616,9 @@ static void softlockup_reconfigure_threads(void) watchdog_nmi_start(); cpus_read_unlock(); } -static inline void softlockup_init_threads(void) +static inline void lockup_detector_setup(void) { - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -658,7 +658,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); } /* @@ -785,5 +785,5 @@ void __init lockup_detector_init(void) if (!watchdog_nmi_probe()) nmi_watchdog_available = true; - softlockup_init_threads(); + lockup_detector_setup(); } -- cgit From 0b62bf862dc93a05fea97b6ca6ffca072e2f30c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Oct 2017 20:59:09 +0200 Subject: watchdog/core: Put softlockup_threads_initialized under ifdef guard The variable is unused when the softlockup detector is disabled in Kconfig. Signed-off-by: Thomas Gleixner --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d241bd99cee1..6bcb854909c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,7 +47,6 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly nmi_watchdog_available; struct cpumask watchdog_allowed_mask __read_mostly; -static bool softlockup_threads_initialized __read_mostly; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -168,6 +167,7 @@ static void lockup_detector_update_enable(void) unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -- cgit From 4edd8121e555acbee63578abeaf73026d055bbb4 Mon Sep 17 00:00:00 2001 From: Alexandre Torgue Date: Wed, 4 Oct 2017 11:42:00 +0200 Subject: ARM: dts: stm32: Fix STMPE1600 binding on stm32429i-eval board To declare gpio interrupt line for STMPE1600, 2 possibilities are offered: -use gpio binding (and then the gpiolib interface inside driver) -use interrupt binding as each gpio-controller are also interrupt controller on stm32f429. In STMPE 1600 node both (gpio and interrupt) bindings are defined. This patch fixes this issue and use only interrupt binding. Fixes: c04b2e72af8d ("ARM: dts: stm32: Enable STMPE1600 gpio expander of STM32F429-EVAL board") Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32429i-eval.dts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/stm32429i-eval.dts b/arch/arm/boot/dts/stm32429i-eval.dts index 97b1c2321ba9..5bdb90b2ae72 100644 --- a/arch/arm/boot/dts/stm32429i-eval.dts +++ b/arch/arm/boot/dts/stm32429i-eval.dts @@ -202,10 +202,8 @@ stmpe1600: stmpe1600@42 { compatible = "st,stmpe1600"; reg = <0x42>; - irq-gpio = <&gpioi 8 0>; - irq-trigger = <3>; interrupts = <8 3>; - interrupt-parent = <&exti>; + interrupt-parent = <&gpioi>; interrupt-controller; wakeup-source; -- cgit From 8969f1f8291762c13147c1ba89d46238af01675b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 1 Oct 2017 09:37:35 +0200 Subject: nvme-pci: Use PCI bus address for data/queues in CMB Currently, NVMe PCI host driver is programming CMB dma address as I/O SQs addresses. This results in failures on systems where 1:1 outbound mapping is not used (example Broadcom iProc SOCs) because CMB BAR will be progammed with PCI bus address but NVMe PCI EP will try to access CMB using dma address. To have CMB working on systems without 1:1 outbound mapping, we program PCI bus address for I/O SQs instead of dma address. This approach will work on systems with/without 1:1 outbound mapping. Based on a report and previous patch from Abhishek Shah. Fixes: 8ffaadf7 ("NVMe: Use CMB for the IO SQes if available") Cc: stable@vger.kernel.org Reported-by: Abhishek Shah Tested-by: Abhishek Shah Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cb73bc8cad3b..3f5a04c586ce 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -94,7 +94,7 @@ struct nvme_dev { struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; - dma_addr_t cmb_dma_addr; + pci_bus_addr_t cmb_bus_addr; u64 cmb_size; u32 cmbsz; u32 cmbloc; @@ -1226,7 +1226,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); - nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; + nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; nvmeq->sq_cmds_io = dev->cmb + offset; } else { nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), @@ -1527,7 +1527,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); void __iomem *cmb; - dma_addr_t dma_addr; + int bar; dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!(NVME_CMB_SZ(dev->cmbsz))) @@ -1540,7 +1540,8 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); offset = szu * NVME_CMB_OFST(dev->cmbloc); - bar_size = pci_resource_len(pdev, NVME_CMB_BIR(dev->cmbloc)); + bar = NVME_CMB_BIR(dev->cmbloc); + bar_size = pci_resource_len(pdev, bar); if (offset > bar_size) return NULL; @@ -1553,12 +1554,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (size > bar_size - offset) size = bar_size - offset; - dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(dev->cmbloc)) + offset; - cmb = ioremap_wc(dma_addr, size); + cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); if (!cmb) return NULL; - dev->cmb_dma_addr = dma_addr; + dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_size = size; return cmb; } -- cgit From 74f1282114acc7d67e25745efe200f020f823c8a Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Wed, 4 Oct 2017 11:15:04 +0200 Subject: powerpc/xive: Fix IPI reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When resetting an IPI, hw_ipi should also be set to zero. Fixes: eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt controller") Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/xive/spapr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index f24a70bc6855..d9c4c9366049 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -431,7 +431,11 @@ static int xive_spapr_get_ipi(unsigned int cpu, struct xive_cpu *xc) static void xive_spapr_put_ipi(unsigned int cpu, struct xive_cpu *xc) { + if (!xc->hw_ipi) + return; + xive_irq_bitmap_free(xc->hw_ipi); + xc->hw_ipi = 0; } #endif /* CONFIG_SMP */ -- cgit From cc56939802fb4c9548be53563387a0700baeec82 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Wed, 4 Oct 2017 11:15:05 +0200 Subject: powerpc/xive: Clear XIVE internal structures when a CPU is removed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt controller") introduced support for the XIVE exploitation mode of the P9 interrupt controller on the pseries platform. At that time, support for CPU removal was not complete on PowerVM and CPU hot unplug remained untested. It appears that some cleanups of the XIVE internal structures are required before releasing the CPU, without which the kernel crashes in a RTAS call doing the CPU isolation. These changes fix the crash by deconfiguring the IPI interrupt source and clearing the event queues of the CPU when it is removed. Fixes: eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt controller") Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/xive/common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f387318678b9..a3b8d7d1316e 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1402,6 +1402,14 @@ void xive_teardown_cpu(void) if (xive_ops->teardown_cpu) xive_ops->teardown_cpu(cpu, xc); + +#ifdef CONFIG_SMP + /* Get rid of IPI */ + xive_cleanup_cpu_ipi(cpu, xc); +#endif + + /* Disable and free the queues */ + xive_cleanup_cpu_queues(cpu, xc); } void xive_kexec_teardown_cpu(int secondary) -- cgit From 7c6a4f3b1641195119ddbb531200f4dc4cecbafa Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 24 Sep 2017 10:30:43 -0700 Subject: powerpc/mm: Call flush_tlb_kernel_range with interrupts enabled flush_tlb_kernel_range() may call smp_call_function_many() which expects interrupts to be enabled. This results in a traceback. WARNING: CPU: 0 PID: 1 at kernel/smp.c:416 smp_call_function_many+0xcc/0x2fc CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.14.0-rc1-00009-g0666f56 #1 task: cf830000 task.stack: cf82e000 NIP: c00a93c8 LR: c00a9634 CTR: 00000001 REGS: cf82fde0 TRAP: 0700 Not tainted (4.14.0-rc1-00009-g0666f56) MSR: 00021000 CR: 24000082 XER: 00000000 GPR00: c00a9634 cf82fe90 cf830000 c050ad3c c0015a54 00000000 00000001 00000001 GPR08: 00000001 00000000 00000000 cf82e000 24000084 00000000 c0003150 00000000 GPR16: 00000000 00000000 00000000 00000000 00000000 00000001 00000000 c0510000 GPR24: 00000000 c0015a54 00000000 c050ad3c c051823c c050ad3c 00000025 00000000 NIP [c00a93c8] smp_call_function_many+0xcc/0x2fc LR [c00a9634] smp_call_function+0x3c/0x50 Call Trace: [cf82fe90] [00000010] 0x10 (unreliable) [cf82fed0] [c00a9634] smp_call_function+0x3c/0x50 [cf82fee0] [c0015d2c] flush_tlb_kernel_range+0x20/0x38 [cf82fef0] [c001524c] mark_initmem_nx+0x154/0x16c [cf82ff20] [c001484c] free_initmem+0x20/0x4c [cf82ff30] [c000316c] kernel_init+0x1c/0x108 [cf82ff40] [c000f3a8] ret_from_kernel_thread+0x5c/0x64 Instruction dump: 7c0803a6 7d808120 38210040 4e800020 3d20c052 812981a0 2f890000 40beffac 3d20c051 8929ac64 2f890000 40beff9c <0fe00000> 4bffff94 7fc3f378 7f64db78 Fixes: 3184cc4b6f6a ("powerpc/mm: Fix kernel RAM protection after freeing ...") Fixes: e611939fc8ec ("powerpc/mm: Ensure change_page_attr() doesn't ...") Cc: Christophe Leroy Signed-off-by: Guenter Roeck Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 65eda1997c3f..f6c7f54c0515 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -361,9 +361,9 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) break; } wmb(); + local_irq_restore(flags); flush_tlb_kernel_range((unsigned long)page_address(start), (unsigned long)page_address(page)); - local_irq_restore(flags); return err; } -- cgit From 2aaae13a9db7897a007c5d7bb46cacfb37dffacd Mon Sep 17 00:00:00 2001 From: Alexandre Torgue Date: Wed, 4 Oct 2017 15:34:48 +0200 Subject: ARM: dts: stm32: use right pinctrl compatible for stm32f469 Currently, same stm32f429-pinctrl driver is used for stm32f429 and stm32f469. As pin map is different between those 2 MCUs, a stm32f469-pinctrl driver has been recently added. This patch -allows to use stm32f469-pinctrl driver for stm32f469 boards -reworks stm32 devicetree files to fit with stm32f429 / stm32f469 In the same time it fixes an issue when only MACH_STM32F469 flag is selected in menuconfig. Fixes: d28bcd53fa90 ("ARM: stm32: Introduce MACH_STM32F469 flag") Reported-by: Nicolas Pitre Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32429i-eval.dts | 1 + arch/arm/boot/dts/stm32f4-pinctrl.dtsi | 343 +++++++++++++++++++++++++++++++ arch/arm/boot/dts/stm32f429-disco.dts | 1 + arch/arm/boot/dts/stm32f429-pinctrl.dtsi | 95 +++++++++ arch/arm/boot/dts/stm32f429.dtsi | 297 -------------------------- arch/arm/boot/dts/stm32f469-disco.dts | 1 + arch/arm/boot/dts/stm32f469-pinctrl.dtsi | 96 +++++++++ 7 files changed, 537 insertions(+), 297 deletions(-) create mode 100644 arch/arm/boot/dts/stm32f4-pinctrl.dtsi create mode 100644 arch/arm/boot/dts/stm32f429-pinctrl.dtsi create mode 100644 arch/arm/boot/dts/stm32f469-pinctrl.dtsi diff --git a/arch/arm/boot/dts/stm32429i-eval.dts b/arch/arm/boot/dts/stm32429i-eval.dts index 5bdb90b2ae72..293ecb957227 100644 --- a/arch/arm/boot/dts/stm32429i-eval.dts +++ b/arch/arm/boot/dts/stm32429i-eval.dts @@ -47,6 +47,7 @@ /dts-v1/; #include "stm32f429.dtsi" +#include "stm32f429-pinctrl.dtsi" #include #include diff --git a/arch/arm/boot/dts/stm32f4-pinctrl.dtsi b/arch/arm/boot/dts/stm32f4-pinctrl.dtsi new file mode 100644 index 000000000000..7f3560c0211d --- /dev/null +++ b/arch/arm/boot/dts/stm32f4-pinctrl.dtsi @@ -0,0 +1,343 @@ +/* + * Copyright 2017 - Alexandre Torgue + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +/ { + soc { + pinctrl: pin-controller { + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x40020000 0x3000>; + interrupt-parent = <&exti>; + st,syscfg = <&syscfg 0x8>; + pins-are-numbered; + + gpioa: gpio@40020000 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x0 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOA)>; + st,bank-name = "GPIOA"; + }; + + gpiob: gpio@40020400 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x400 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOB)>; + st,bank-name = "GPIOB"; + }; + + gpioc: gpio@40020800 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x800 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOC)>; + st,bank-name = "GPIOC"; + }; + + gpiod: gpio@40020c00 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0xc00 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOD)>; + st,bank-name = "GPIOD"; + }; + + gpioe: gpio@40021000 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1000 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOE)>; + st,bank-name = "GPIOE"; + }; + + gpiof: gpio@40021400 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1400 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOF)>; + st,bank-name = "GPIOF"; + }; + + gpiog: gpio@40021800 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1800 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOG)>; + st,bank-name = "GPIOG"; + }; + + gpioh: gpio@40021c00 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1c00 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOH)>; + st,bank-name = "GPIOH"; + }; + + gpioi: gpio@40022000 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x2000 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOI)>; + st,bank-name = "GPIOI"; + }; + + gpioj: gpio@40022400 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x2400 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOJ)>; + st,bank-name = "GPIOJ"; + }; + + gpiok: gpio@40022800 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x2800 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOK)>; + st,bank-name = "GPIOK"; + }; + + usart1_pins_a: usart1@0 { + pins1 { + pinmux = ; + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = ; + bias-disable; + }; + }; + + usart3_pins_a: usart3@0 { + pins1 { + pinmux = ; + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = ; + bias-disable; + }; + }; + + usbotg_fs_pins_a: usbotg_fs@0 { + pins { + pinmux = , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <2>; + }; + }; + + usbotg_fs_pins_b: usbotg_fs@1 { + pins { + pinmux = , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <2>; + }; + }; + + usbotg_hs_pins_a: usbotg_hs@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <2>; + }; + }; + + ethernet_mii: mii@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + , + , + ; + slew-rate = <2>; + }; + }; + + adc3_in8_pin: adc@200 { + pins { + pinmux = ; + }; + }; + + pwm1_pins: pwm@1 { + pins { + pinmux = , + , + ; + }; + }; + + pwm3_pins: pwm@3 { + pins { + pinmux = , + ; + }; + }; + + i2c1_pins: i2c1@0 { + pins { + pinmux = , + ; + bias-disable; + drive-open-drain; + slew-rate = <3>; + }; + }; + + ltdc_pins: ltdc@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + slew-rate = <2>; + }; + }; + + dcmi_pins: dcmi@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <3>; + }; + }; + }; + }; +}; diff --git a/arch/arm/boot/dts/stm32f429-disco.dts b/arch/arm/boot/dts/stm32f429-disco.dts index c66d617e4245..5ceb2cf3777f 100644 --- a/arch/arm/boot/dts/stm32f429-disco.dts +++ b/arch/arm/boot/dts/stm32f429-disco.dts @@ -47,6 +47,7 @@ /dts-v1/; #include "stm32f429.dtsi" +#include "stm32f429-pinctrl.dtsi" #include / { diff --git a/arch/arm/boot/dts/stm32f429-pinctrl.dtsi b/arch/arm/boot/dts/stm32f429-pinctrl.dtsi new file mode 100644 index 000000000000..3e7a17d9112e --- /dev/null +++ b/arch/arm/boot/dts/stm32f429-pinctrl.dtsi @@ -0,0 +1,95 @@ +/* + * Copyright 2017 - Alexandre Torgue + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "stm32f4-pinctrl.dtsi" + +/ { + soc { + pinctrl: pin-controller { + compatible = "st,stm32f429-pinctrl"; + + gpioa: gpio@40020000 { + gpio-ranges = <&pinctrl 0 0 16>; + }; + + gpiob: gpio@40020400 { + gpio-ranges = <&pinctrl 0 16 16>; + }; + + gpioc: gpio@40020800 { + gpio-ranges = <&pinctrl 0 32 16>; + }; + + gpiod: gpio@40020c00 { + gpio-ranges = <&pinctrl 0 48 16>; + }; + + gpioe: gpio@40021000 { + gpio-ranges = <&pinctrl 0 64 16>; + }; + + gpiof: gpio@40021400 { + gpio-ranges = <&pinctrl 0 80 16>; + }; + + gpiog: gpio@40021800 { + gpio-ranges = <&pinctrl 0 96 16>; + }; + + gpioh: gpio@40021c00 { + gpio-ranges = <&pinctrl 0 112 16>; + }; + + gpioi: gpio@40022000 { + gpio-ranges = <&pinctrl 0 128 16>; + }; + + gpioj: gpio@40022400 { + gpio-ranges = <&pinctrl 0 144 16>; + }; + + gpiok: gpio@40022800 { + gpio-ranges = <&pinctrl 0 160 8>; + }; + }; + }; +}; diff --git a/arch/arm/boot/dts/stm32f429.dtsi b/arch/arm/boot/dts/stm32f429.dtsi index dd7e99b1f43b..5b36eb114ddc 100644 --- a/arch/arm/boot/dts/stm32f429.dtsi +++ b/arch/arm/boot/dts/stm32f429.dtsi @@ -47,7 +47,6 @@ #include "skeleton.dtsi" #include "armv7-m.dtsi" -#include #include #include @@ -591,302 +590,6 @@ status = "disabled"; }; - pinctrl: pin-controller { - #address-cells = <1>; - #size-cells = <1>; - compatible = "st,stm32f429-pinctrl"; - ranges = <0 0x40020000 0x3000>; - interrupt-parent = <&exti>; - st,syscfg = <&syscfg 0x8>; - pins-are-numbered; - - gpioa: gpio@40020000 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x0 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOA)>; - st,bank-name = "GPIOA"; - }; - - gpiob: gpio@40020400 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x400 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOB)>; - st,bank-name = "GPIOB"; - }; - - gpioc: gpio@40020800 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x800 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOC)>; - st,bank-name = "GPIOC"; - }; - - gpiod: gpio@40020c00 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0xc00 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOD)>; - st,bank-name = "GPIOD"; - }; - - gpioe: gpio@40021000 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x1000 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOE)>; - st,bank-name = "GPIOE"; - }; - - gpiof: gpio@40021400 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x1400 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOF)>; - st,bank-name = "GPIOF"; - }; - - gpiog: gpio@40021800 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x1800 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOG)>; - st,bank-name = "GPIOG"; - }; - - gpioh: gpio@40021c00 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x1c00 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOH)>; - st,bank-name = "GPIOH"; - }; - - gpioi: gpio@40022000 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x2000 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOI)>; - st,bank-name = "GPIOI"; - }; - - gpioj: gpio@40022400 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x2400 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOJ)>; - st,bank-name = "GPIOJ"; - }; - - gpiok: gpio@40022800 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x2800 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOK)>; - st,bank-name = "GPIOK"; - }; - - usart1_pins_a: usart1@0 { - pins1 { - pinmux = ; - bias-disable; - drive-push-pull; - slew-rate = <0>; - }; - pins2 { - pinmux = ; - bias-disable; - }; - }; - - usart3_pins_a: usart3@0 { - pins1 { - pinmux = ; - bias-disable; - drive-push-pull; - slew-rate = <0>; - }; - pins2 { - pinmux = ; - bias-disable; - }; - }; - - usbotg_fs_pins_a: usbotg_fs@0 { - pins { - pinmux = , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <2>; - }; - }; - - usbotg_fs_pins_b: usbotg_fs@1 { - pins { - pinmux = , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <2>; - }; - }; - - usbotg_hs_pins_a: usbotg_hs@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <2>; - }; - }; - - ethernet_mii: mii@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - , - , - ; - slew-rate = <2>; - }; - }; - - adc3_in8_pin: adc@200 { - pins { - pinmux = ; - }; - }; - - pwm1_pins: pwm@1 { - pins { - pinmux = , - , - ; - }; - }; - - pwm3_pins: pwm@3 { - pins { - pinmux = , - ; - }; - }; - - i2c1_pins: i2c1@0 { - pins { - pinmux = , - ; - bias-disable; - drive-open-drain; - slew-rate = <3>; - }; - }; - - ltdc_pins: ltdc@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - ; - slew-rate = <2>; - }; - }; - - dcmi_pins: dcmi@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - , - , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <3>; - }; - }; - }; - crc: crc@40023000 { compatible = "st,stm32f4-crc"; reg = <0x40023000 0x400>; diff --git a/arch/arm/boot/dts/stm32f469-disco.dts b/arch/arm/boot/dts/stm32f469-disco.dts index 6ae1f037f3f0..c18acbe4cf4e 100644 --- a/arch/arm/boot/dts/stm32f469-disco.dts +++ b/arch/arm/boot/dts/stm32f469-disco.dts @@ -47,6 +47,7 @@ /dts-v1/; #include "stm32f429.dtsi" +#include "stm32f469-pinctrl.dtsi" / { model = "STMicroelectronics STM32F469i-DISCO board"; diff --git a/arch/arm/boot/dts/stm32f469-pinctrl.dtsi b/arch/arm/boot/dts/stm32f469-pinctrl.dtsi new file mode 100644 index 000000000000..fff542662eea --- /dev/null +++ b/arch/arm/boot/dts/stm32f469-pinctrl.dtsi @@ -0,0 +1,96 @@ +/* + * Copyright 2017 - Alexandre Torgue + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "stm32f4-pinctrl.dtsi" + +/ { + soc { + pinctrl: pin-controller { + compatible = "st,stm32f469-pinctrl"; + + gpioa: gpio@40020000 { + gpio-ranges = <&pinctrl 0 0 16>; + }; + + gpiob: gpio@40020400 { + gpio-ranges = <&pinctrl 0 16 16>; + }; + + gpioc: gpio@40020800 { + gpio-ranges = <&pinctrl 0 32 16>; + }; + + gpiod: gpio@40020c00 { + gpio-ranges = <&pinctrl 0 48 16>; + }; + + gpioe: gpio@40021000 { + gpio-ranges = <&pinctrl 0 64 16>; + }; + + gpiof: gpio@40021400 { + gpio-ranges = <&pinctrl 0 80 16>; + }; + + gpiog: gpio@40021800 { + gpio-ranges = <&pinctrl 0 96 16>; + }; + + gpioh: gpio@40021c00 { + gpio-ranges = <&pinctrl 0 112 16>; + }; + + gpioi: gpio@40022000 { + gpio-ranges = <&pinctrl 0 128 16>; + }; + + gpioj: gpio@40022400 { + gpio-ranges = <&pinctrl 0 144 6>, + <&pinctrl 12 156 4>; + }; + + gpiok: gpio@40022800 { + gpio-ranges = <&pinctrl 3 163 5>; + }; + }; + }; +}; -- cgit From 2d8ce70a08fe033c904115d59276ad86adeaa337 Mon Sep 17 00:00:00 2001 From: Goffredo Baroncelli Date: Tue, 3 Oct 2017 19:31:10 +0200 Subject: btrfs: avoid overflow when sector_t is 32 bit Jean-Denis Girard noticed commit c821e7f3 "pass bytes to btrfs_bio_alloc" (https://patchwork.kernel.org/patch/9763081/) introduces a regression on 32 bit machines. When CONFIG_LBDAF is _not_ defined (CONFIG_LBDAF == Support for large (2TB+) block devices and files) sector_t is 32 bit on 32bit machines. In the function submit_extent_page, 'sector' (which is sector_t type) is multiplied by 512 to convert it from sectors to bytes, leading to an overflow when the disk is bigger than 4GB (!). I added a cast to u64 to avoid overflow. Fixes: c821e7f3 ("btrfs: pass bytes to btrfs_bio_alloc") CC: stable@vger.kernel.org # 4.13+ Signed-off-by: Goffredo Baroncelli Tested-by: Jean-Denis Girard Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3738d245518c..8dfa181b1a92 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2801,7 +2801,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector << 9); + bio = btrfs_bio_alloc(bdev, (u64)sector << 9); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; -- cgit From eab40cf336065e8d765e006b81ff48c5c114b365 Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Tue, 3 Oct 2017 12:48:37 +0200 Subject: bsg-lib: fix use-after-free under memory-pressure When under memory-pressure it is possible that the mempool which backs the 'struct request_queue' will make use of up to BLKDEV_MIN_RQ count emergency buffers - in case it can't get a regular allocation. These buffers are preallocated and once they are also used, they are re-supplied with old finished requests from the same request_queue (see mempool_free()). The bug is, when re-supplying the emergency pool, the old requests are not again ran through the callback mempool_t->alloc(), and thus also not through the callback bsg_init_rq(). Thus we skip initialization, and while the sense-buffer still should be good, scsi_request->cmd might have become to be an invalid pointer in the meantime. When the request is initialized in bsg.c, and the user's CDB is larger than BLK_MAX_CDB, bsg will replace it with a custom allocated buffer, which is freed when the user's command is finished, thus it dangles afterwards. When next a command is sent by the user that has a smaller/similar CDB as BLK_MAX_CDB, bsg will assume that scsi_request->cmd is backed by scsi_request->__cmd, will not make a custom allocation, and write into undefined memory. Fix this by splitting bsg_init_rq() into two functions: - bsg_init_rq() is changed to only do the allocation of the sense-buffer, which is used to back the bsg job's reply buffer. This pointer should never change during the lifetime of a scsi_request, so it doesn't need re-initialization. - bsg_initialize_rq() is a new function that makes use of 'struct request_queue's initialize_rq_fn callback (which was introduced in v4.12). This is always called before the request is given out via blk_get_request(). This function does the remaining initialization that was previously done in bsg_init_rq(), and will also do it when the request is taken from the emergency-pool of the backing mempool. Fixes: 50b4d485528d ("bsg-lib: fix kernel panic resulting from missing allocation of reply-buffer") Cc: # 4.11+ Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Benjamin Block Signed-off-by: Jens Axboe --- block/bsg-lib.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index dbddff8174e5..15d25ccd51a5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -207,20 +207,34 @@ static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) struct bsg_job *job = blk_mq_rq_to_pdu(req); struct scsi_request *sreq = &job->sreq; + /* called right after the request is allocated for the request_queue */ + + sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); + if (!sreq->sense) + return -ENOMEM; + + return 0; +} + +static void bsg_initialize_rq(struct request *req) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + void *sense = sreq->sense; + + /* called right before the request is given to the request_queue user */ + memset(job, 0, sizeof(*job)); scsi_req_init(sreq); + + sreq->sense = sense; sreq->sense_len = SCSI_SENSE_BUFFERSIZE; - sreq->sense = kzalloc(sreq->sense_len, gfp); - if (!sreq->sense) - return -ENOMEM; job->req = req; - job->reply = sreq->sense; + job->reply = sense; job->reply_len = sreq->sense_len; job->dd_data = job + 1; - - return 0; } static void bsg_exit_rq(struct request_queue *q, struct request *req) @@ -251,6 +265,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, q->cmd_size = sizeof(struct bsg_job) + dd_job_size; q->init_rq_fn = bsg_init_rq; q->exit_rq_fn = bsg_exit_rq; + q->initialize_rq_fn = bsg_initialize_rq; q->request_fn = bsg_request_fn; ret = blk_init_allocated_queue(q); -- cgit From 69ad59767d094752c23c0fc180a79532fde073d0 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Wed, 4 Oct 2017 11:05:17 +0900 Subject: Btrfs: fix overlap of fs_info::flags values Because the values of BTRFS_FS_EXCL_OP and BTRFS_FS_QUOTA_OVERRIDE overlap, we should change the value. First, BTRFS_FS_EXCL_OP was set to 14. commit 171938e52807 ("btrfs: track exclusive filesystem operation in flags") Next, the value of BTRFS_FS_QUOTA_OVERRIDE was set to 14. commit f29efe292198 ("btrfs: add quota override flag to enable quota override for CAP_SYS_RESOURCE") As a result, the value 14 overlapped, by accident. This problem is solved by defining the value of BTRFS_FS_EXCL_OP as 16, the flags are internal. Fixes: f29efe292198 ("btrfs: add quota override flag to enable quota override for CAP_SYS_RESOURCE") CC: stable@vger.kernel.org # 4.13+ Signed-off-by: Tsutomu Itoh Reviewed-by: David Sterba [ minimize the change, update only BTRFS_FS_EXCL_OP ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7ccfcc01732..aff3248beb90 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -721,7 +721,7 @@ struct btrfs_delayed_root; * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) */ -#define BTRFS_FS_EXCL_OP 14 +#define BTRFS_FS_EXCL_OP 16 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; -- cgit From 783874b050768d361239e444ba0fa396bb6d463f Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 13 Sep 2017 15:45:56 +0200 Subject: dm crypt: reject sector_size feature if device length is not aligned to it If a crypt mapping uses optional sector_size feature, additional restrictions to mapped device segment size must be applied in constructor, otherwise the device activation will fail later. Fixes: 8f0009a225 ("dm crypt: optionally support larger encryption sector size") Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 75341fdca4b6..96ab46512e1f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2585,6 +2585,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar ti->error = "Invalid feature value for sector_size"; return -EINVAL; } + if (ti->len & ((cc->sector_size >> SECTOR_SHIFT) - 1)) { + ti->error = "Device size is not multiple of sector_size feature"; + return -EINVAL; + } cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; } else if (!strcasecmp(opt_string, "iv_large_sectors")) set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); -- cgit From 5dcbeca615ef12047a5f4097b91030cbf995b1d2 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 19 Sep 2017 12:01:08 +0200 Subject: clk: samsung: exynos4: Enable VPLL and EPLL clocks for suspend/resume cycle Commit 6edfa11cb396 ("clk: samsung: Add enable/disable operation for PLL36XX clocks") added enable/disable operations to PLL clocks. Prior that VPLL and EPPL clocks were always enabled because the enable bit was never touched. Those clocks have to be enabled during suspend/resume cycle, because otherwise board fails to enter sleep mode. This patch enables them unconditionally before entering system suspend state. System restore function will set them to the previous state saved in the register cache done before that unconditional enable. Fixes: 6edfa11cb396 ("clk: samsung: Add enable/disable operation for PLL36XX clocks") CC: stable@vger.kernel.org # v4.13 Signed-off-by: Marek Szyprowski Reviewed-by: Chanwoo Choi Reviewed-by: Krzysztof Kozlowski Acked-by: Sylwester Nawrocki Signed-off-by: Stephen Boyd --- drivers/clk/samsung/clk-exynos4.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/clk/samsung/clk-exynos4.c b/drivers/clk/samsung/clk-exynos4.c index e40b77583c47..d8d3cb67b402 100644 --- a/drivers/clk/samsung/clk-exynos4.c +++ b/drivers/clk/samsung/clk-exynos4.c @@ -294,6 +294,18 @@ static const struct samsung_clk_reg_dump src_mask_suspend_e4210[] = { #define PLL_ENABLED (1 << 31) #define PLL_LOCKED (1 << 29) +static void exynos4_clk_enable_pll(u32 reg) +{ + u32 pll_con = readl(reg_base + reg); + pll_con |= PLL_ENABLED; + writel(pll_con, reg_base + reg); + + while (!(pll_con & PLL_LOCKED)) { + cpu_relax(); + pll_con = readl(reg_base + reg); + } +} + static void exynos4_clk_wait_for_pll(u32 reg) { u32 pll_con; @@ -315,6 +327,9 @@ static int exynos4_clk_suspend(void) samsung_clk_save(reg_base, exynos4_save_pll, ARRAY_SIZE(exynos4_clk_pll_regs)); + exynos4_clk_enable_pll(EPLL_CON0); + exynos4_clk_enable_pll(VPLL_CON0); + if (exynos4_soc == EXYNOS4210) { samsung_clk_save(reg_base, exynos4_save_soc, ARRAY_SIZE(exynos4210_clk_save)); -- cgit From a2b7861bb33b2538420bb5d8554153484d3f961f Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 3 Oct 2017 21:36:51 +0800 Subject: kvm/x86: Avoid async PF preempting the kernel incorrectly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, in PREEMPT_COUNT=n kernel, kvm_async_pf_task_wait() could call schedule() to reschedule in some cases. This could result in accidentally ending the current RCU read-side critical section early, causing random memory corruption in the guest, or otherwise preempting the currently running task inside between preempt_disable and preempt_enable. The difficulty to handle this well is because we don't know whether an async PF delivered in a preemptible section or RCU read-side critical section for PREEMPT_COUNT=n, since preempt_disable()/enable() and rcu_read_lock/unlock() are both no-ops in that case. To cure this, we treat any async PF interrupting a kernel context as one that cannot be preempted, preventing kvm_async_pf_task_wait() from choosing the schedule() path in that case. To do so, a second parameter for kvm_async_pf_task_wait() is introduced, so that we know whether it's called from a context interrupting the kernel, and the parameter is set properly in all the callsites. Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Wanpeng Li Cc: stable@vger.kernel.org Signed-off-by: Boqun Feng Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_para.h | 4 ++-- arch/x86/kernel/kvm.c | 14 ++++++++++---- arch/x86/kvm/mmu.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index bc62e7cbf1b1..59ad3d132353 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); void __init kvm_guest_init(void); -void kvm_async_pf_task_wait(u32 token); +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); @@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void) #else /* CONFIG_KVM_GUEST */ #define kvm_guest_init() do {} while (0) -#define kvm_async_pf_task_wait(T) do {} while(0) +#define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e675704fa6f7..8bb9594d0761 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -void kvm_async_pf_task_wait(u32 token) +/* + * @interrupt_kernel: Is this called from a routine which interrupts the kernel + * (other than user space)? + */ +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1 || - rcu_preempt_depth(); + n.halted = is_idle_task(current) || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) + ? preempt_count() > 1 || rcu_preempt_depth() + : interrupt_kernel); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2()); + kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eca30c1eb1d9..106d4a029a8a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, case KVM_PV_REASON_PAGE_NOT_PRESENT: vcpu->arch.apf.host_apf_reason = 0; local_irq_disable(); - kvm_async_pf_task_wait(fault_address); + kvm_async_pf_task_wait(fault_address, 0); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: -- cgit From 37f6b42e9c2966b08c7df5cfddc0d73c39cead4a Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 2 Oct 2017 18:28:44 +0100 Subject: ACPI/IORT: Fix PCI ACS enablement commit f6810c15cf97 ("iommu/arm-smmu: Clean up early-probing workarounds") removed kernel code that was allowing to initialize and probe the SMMU devices early (ie earlier than PCI devices, through linker script callback entries) in the boot process because it was not needed any longer in that the SMMU devices/drivers now support deferred probing. Since the SMMUs probe routines are also in charge of requesting global PCI ACS kernel enablement, commit f6810c15cf97 ("iommu/arm-smmu: Clean up early-probing workarounds") also postponed PCI ACS enablement to SMMUs devices probe time, which is too late given that PCI devices needs to detect if PCI ACS is enabled to init the respective capability through the following call path: pci_device_add() -> pci_init_capabilities() -> pci_enable_acs() Add code in the ACPI IORT SMMU platform devices initialization path (that is called before ACPI PCI enumeration) to detect if there exists firmware mappings to map root complexes ids to SMMU ids and if so enable ACS for the system. Fixes: f6810c15cf97 ("iommu/arm-smmu: Clean up early-probing workarounds") Reviewed-by: Robin Murphy Tested-by: Nate Watterson Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Zhou Wang Cc: Alex Williamson Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 9565d572f8dd..de56394dd161 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1178,12 +1178,44 @@ dev_put: return ret; } +static bool __init iort_enable_acs(struct acpi_iort_node *iort_node) +{ + if (iort_node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) { + struct acpi_iort_node *parent; + struct acpi_iort_id_mapping *map; + int i; + + map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, iort_node, + iort_node->mapping_offset); + + for (i = 0; i < iort_node->mapping_count; i++, map++) { + if (!map->output_reference) + continue; + + parent = ACPI_ADD_PTR(struct acpi_iort_node, + iort_table, map->output_reference); + /* + * If we detect a RC->SMMU mapping, make sure + * we enable ACS on the system. + */ + if ((parent->type == ACPI_IORT_NODE_SMMU) || + (parent->type == ACPI_IORT_NODE_SMMU_V3)) { + pci_request_acs(); + return true; + } + } + } + + return false; +} + static void __init iort_init_platform_devices(void) { struct acpi_iort_node *iort_node, *iort_end; struct acpi_table_iort *iort; struct fwnode_handle *fwnode; int i, ret; + bool acs_enabled = false; /* * iort_table and iort both point to the start of IORT table, but @@ -1203,6 +1235,9 @@ static void __init iort_init_platform_devices(void) return; } + if (!acs_enabled) + acs_enabled = iort_enable_acs(iort_node); + if ((iort_node->type == ACPI_IORT_NODE_SMMU) || (iort_node->type == ACPI_IORT_NODE_SMMU_V3)) { -- cgit From b02faed15d86f846b0f23f47b92e0782baa873ed Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 3 Oct 2017 18:25:46 +0100 Subject: arm64: Use larger stacks when KASAN is selected AddressSanitizer instrumentation can significantly bloat the stack, and with GCC 7 this can result in stack overflows at boot time in some configurations. We can avoid this by doubling our stack size when KASAN is in use, as is already done on x86 (and has been since KASAN was introduced). Regardless of other patches to decrease KASAN's stack utilization, kernels built with KASAN will always require more stack space than those built without, and we should take this into account. Signed-off-by: Mark Rutland Cc: Will Deacon Cc: Suzuki K Poulose Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/memory.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3585a5e26151..f7c4d2146aed 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -95,16 +95,19 @@ #define KERNEL_END _end /* - * The size of the KASAN shadow region. This should be 1/8th of the - * size of the entire kernel virtual address space. + * KASAN requires 1/8th of the kernel virtual address space for the shadow + * region. KASAN can bloat the stack significantly, so double the (minimum) + * stack size when KASAN is in use. */ #ifdef CONFIG_KASAN #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#define KASAN_THREAD_SHIFT 1 #else #define KASAN_SHADOW_SIZE (0) +#define KASAN_THREAD_SHIFT 0 #endif -#define MIN_THREAD_SHIFT 14 +#define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) /* * VMAP'd stacks are allocated at page granularity, so we must ensure that such -- cgit From e19cd282ebedb801e572efae5df2f88d573932ce Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 1 Oct 2017 09:54:35 +0300 Subject: IB/mlx5: Fix label order in error path handling When UAR get_page fails, it needs to continue to cleanup debugfs for congestion control parameters. Labels for error path were incorrectly ordered. This patch fixes to do correct cleanup on debugfs init failure and uar get page failure. Fixes: 4a2da0b8c078 ("IB/mlx5: Add debug control parameters for congestion control") Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d6fbad8f34aa..552f7bd4ecc3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4174,9 +4174,9 @@ err_bfreg: err_uar_page: mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); -err_cnt: - mlx5_ib_cleanup_cong_debugfs(dev); err_cong: + mlx5_ib_cleanup_cong_debugfs(dev); +err_cnt: if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); -- cgit From efe63c220e57ce74e240670399912e609ac125d9 Mon Sep 17 00:00:00 2001 From: "Amrani, Ram" Date: Tue, 3 Oct 2017 14:47:26 +0300 Subject: RDMA/qedr: Parse VLAN ID correctly and ignore the value of zero Rename vlan_id field name to vlan as it contains more than the vlan_id. Mask out non vlan id fields from vlan tag of the QED LL2 RX GSI vlan output. As it is expected to be vlan id only. Ignore vlan_id with value of zero. Fixes: abd49676c707 ("qed: Add RoCE ll2 & GSI support") Signed-off-by: Ram Amrani Signed-off-by: Michal Kalderon Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/qedr.h | 2 +- drivers/infiniband/hw/qedr/qedr_cm.c | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index b2bb42e2805d..254083b524bd 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -387,7 +387,7 @@ struct qedr_qp { u8 wqe_size; u8 smac[ETH_ALEN]; - u16 vlan_id; + u16 vlan; int rc; } *rqe_wr_id; diff --git a/drivers/infiniband/hw/qedr/qedr_cm.c b/drivers/infiniband/hw/qedr/qedr_cm.c index 4689e802b332..5ebbe4952386 100644 --- a/drivers/infiniband/hw/qedr/qedr_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_cm.c @@ -105,7 +105,7 @@ void qedr_ll2_complete_rx_packet(void *cxt, qp->rqe_wr_id[qp->rq.gsi_cons].rc = data->u.data_length_error ? -EINVAL : 0; - qp->rqe_wr_id[qp->rq.gsi_cons].vlan_id = data->vlan; + qp->rqe_wr_id[qp->rq.gsi_cons].vlan = data->vlan; /* note: length stands for data length i.e. GRH is excluded */ qp->rqe_wr_id[qp->rq.gsi_cons].sg_list[0].length = data->length.data_length; @@ -694,6 +694,7 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) struct qedr_cq *cq = get_qedr_cq(ibcq); struct qedr_qp *qp = dev->gsi_qp; unsigned long flags; + u16 vlan_id; int i = 0; spin_lock_irqsave(&cq->cq_lock, flags); @@ -712,9 +713,12 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) wc[i].wc_flags |= IB_WC_GRH | IB_WC_IP_CSUM_OK; ether_addr_copy(wc[i].smac, qp->rqe_wr_id[qp->rq.cons].smac); wc[i].wc_flags |= IB_WC_WITH_SMAC; - if (qp->rqe_wr_id[qp->rq.cons].vlan_id) { + + vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan & + VLAN_VID_MASK; + if (vlan_id) { wc[i].wc_flags |= IB_WC_WITH_VLAN; - wc[i].vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan_id; + wc[i].vlan_id = vlan_id; } qedr_inc_sw_cons(&qp->rq); -- cgit From 1736b4c99d1c53abec042d41b702aeabeb65d86a Mon Sep 17 00:00:00 2001 From: "Amrani, Ram" Date: Tue, 3 Oct 2017 14:47:27 +0300 Subject: RDMA/qedr: Parse vlan priority as sl Parse the vlan priority from the vlan tag and configure it to the WC's sl field. Fixes: abd49676c707 ("qed: Add RoCE ll2 & GSI support") Signed-off-by: Ram Amrani Signed-off-by: Michal Kalderon Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/qedr_cm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/qedr/qedr_cm.c b/drivers/infiniband/hw/qedr/qedr_cm.c index 5ebbe4952386..ad8965397cf7 100644 --- a/drivers/infiniband/hw/qedr/qedr_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_cm.c @@ -719,6 +719,8 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) if (vlan_id) { wc[i].wc_flags |= IB_WC_WITH_VLAN; wc[i].vlan_id = vlan_id; + wc[i].sl = (qp->rqe_wr_id[qp->rq.cons].vlan & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } qedr_inc_sw_cons(&qp->rq); -- cgit From 43bfc24ec1d69853d706cb3ebfdb9088846b9b50 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 3 Oct 2017 11:11:49 -0500 Subject: i40iw: Add missing memory barriers Remove duplicate set_64bit_val call to offset 24. Replace some instances of set_64bit_val with i40iw_insert_wqe_hdr as valid bit needs a write barrier and should be the last write operation for the WQE. Fixes: 786c6adb3a94 ("i40iw: add puda code") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_p.h | 2 ++ drivers/infiniband/hw/i40iw/i40iw_puda.c | 11 ++++------- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index d1f5345f04f0..42ca5346777d 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -48,7 +48,7 @@ * @wqe: cqp wqe for header * @header: header for the cqp wqe */ -static inline void i40iw_insert_wqe_hdr(u64 *wqe, u64 header) +void i40iw_insert_wqe_hdr(u64 *wqe, u64 header) { wmb(); /* make sure WQE is populated before polarity is set */ set_64bit_val(wqe, 24, header); diff --git a/drivers/infiniband/hw/i40iw/i40iw_p.h b/drivers/infiniband/hw/i40iw/i40iw_p.h index e217a1259f57..5498ad01c280 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_p.h +++ b/drivers/infiniband/hw/i40iw/i40iw_p.h @@ -59,6 +59,8 @@ enum i40iw_status_code i40iw_sc_mr_fast_register(struct i40iw_sc_qp *qp, struct i40iw_fast_reg_stag_info *info, bool post_sq); +void i40iw_insert_wqe_hdr(u64 *wqe, u64 header); + /* HMC/FPM functions */ enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_id); diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index c2cab20c4bc5..59f70676f0e0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -123,12 +123,11 @@ static void i40iw_puda_post_recvbuf(struct i40iw_puda_rsrc *rsrc, u32 wqe_idx, get_64bit_val(wqe, 24, &offset24); offset24 = (offset24) ? 0 : LS_64(1, I40IWQPSQ_VALID); - set_64bit_val(wqe, 24, offset24); set_64bit_val(wqe, 0, buf->mem.pa); set_64bit_val(wqe, 8, LS_64(buf->mem.size, I40IWQPSQ_FRAG_LEN)); - set_64bit_val(wqe, 24, offset24); + i40iw_insert_wqe_hdr(wqe, offset24); } /** @@ -409,9 +408,7 @@ enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp, set_64bit_val(wqe, 8, LS_64(info->len, I40IWQPSQ_FRAG_LEN)); set_64bit_val(wqe, 16, header[0]); - /* Ensure all data is written before writing valid bit */ - wmb(); - set_64bit_val(wqe, 24, header[1]); + i40iw_insert_wqe_hdr(wqe, header[1]); i40iw_debug_buf(qp->dev, I40IW_DEBUG_PUDA, "PUDA SEND WQE", wqe, 32); i40iw_qp_post_wr(&qp->qp_uk); @@ -539,7 +536,7 @@ static enum i40iw_status_code i40iw_puda_qp_wqe(struct i40iw_sc_dev *dev, struct LS_64(2, I40IW_CQPSQ_QP_NEXTIWSTATE) | LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); - set_64bit_val(wqe, 24, header); + i40iw_insert_wqe_hdr(wqe, header); i40iw_debug_buf(cqp->dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, 32); i40iw_sc_cqp_post_sq(cqp); @@ -655,7 +652,7 @@ static enum i40iw_status_code i40iw_puda_cq_wqe(struct i40iw_sc_dev *dev, struct LS_64(1, I40IW_CQPSQ_CQ_ENCEQEMASK) | LS_64(1, I40IW_CQPSQ_CQ_CEQIDVALID) | LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); - set_64bit_val(wqe, 24, header); + i40iw_insert_wqe_hdr(wqe, header); i40iw_debug_buf(dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, I40IW_CQP_WQE_SIZE * 8); -- cgit From 789f903fd75036f937409a9a1616a5a5e5cc5bae Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 3 Oct 2017 11:11:50 -0500 Subject: i40iw: Fix port number for query QP Port number 0 is an invalid port number. Set it to 1 as there is one port per i40iw device. Fixes: d37498417947 ("i40iw: add files for iwarp interface") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 28b3d02d511b..62be0a41ad0b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -826,12 +826,14 @@ static int i40iw_query_qp(struct ib_qp *ibqp, attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE; attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + attr->port_num = 1; init_attr->event_handler = iwqp->ibqp.event_handler; init_attr->qp_context = iwqp->ibqp.qp_context; init_attr->send_cq = iwqp->ibqp.send_cq; init_attr->recv_cq = iwqp->ibqp.recv_cq; init_attr->srq = iwqp->ibqp.srq; init_attr->cap = attr->cap; + init_attr->port_num = 1; return 0; } -- cgit From 2a8408e5372503fe4df9b5bbb49fb3af0c212c67 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 3 Oct 2017 12:51:58 +0300 Subject: drm/i915/cnl: Reprogram DMC firmware after S3/S4 resume The DMC firmware program memory is lost after S3/S4 system suspend, so we need to reprogram it during resume. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103070 Fixes: cebfcead63de ("drm/i915/DMC/CNL: Load DMC on CNL") Cc: Anusha Srivatsa Cc: Animesh Manna Cc: Rodrigo Vivi Signed-off-by: Imre Deak Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20171003095159.711-1-imre.deak@intel.com (cherry picked from commit 57522c4c87de20d8f7ad4e142a3a4334066d55ff) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_runtime_pm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index b66d8e136aa3..b3a087cb0860 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -2782,6 +2782,9 @@ static void cnl_display_core_init(struct drm_i915_private *dev_priv, bool resume /* 6. Enable DBUF */ gen9_dbuf_enable(dev_priv); + + if (resume && dev_priv->csr.dmc_payload) + intel_csr_load_program(dev_priv); } #undef CNL_PROCMON_IDX -- cgit From 069d40f5834ad26a58f269225a7e13af17019062 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 3 Oct 2017 12:51:59 +0300 Subject: drm/i915/glk: Fix DMC/DC state idleness calculation According to BSpec GLK like BXT needs to ignore the idle state of cores before starting the DMC firmware's DC state handler. Fixes: dbb28b5c3d3c ("drm/i915/DMC/GLK: Load DMC on GLK") Cc: Anusha Srivatsa Cc: Rodrigo Vivi Signed-off-by: Imre Deak Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20171003095159.711-2-imre.deak@intel.com (cherry picked from commit b7208a3f3e52791571df064fb96025ad48edd1bf) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_csr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_csr.c b/drivers/gpu/drm/i915/intel_csr.c index 965988f79a55..92c1f8e166dc 100644 --- a/drivers/gpu/drm/i915/intel_csr.c +++ b/drivers/gpu/drm/i915/intel_csr.c @@ -216,7 +216,7 @@ static void gen9_set_dc_state_debugmask(struct drm_i915_private *dev_priv) mask = DC_STATE_DEBUG_MASK_MEMORY_UP; - if (IS_BROXTON(dev_priv)) + if (IS_GEN9_LP(dev_priv)) mask |= DC_STATE_DEBUG_MASK_CORES; /* The below bit doesn't need to be cleared ever afterwards */ -- cgit From e0a86312874e36033cd94fb977dd603a292875c8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 4 Oct 2017 23:10:59 +0100 Subject: Update James Hogan's email address Update my imgtec.com and personal email address to my kernel.org one in a few places as MIPS will soon no longer be part of Imagination Technologies, and add mappings in .mailcap so get_maintainer.pl reports the right address. Signed-off-by: James Hogan Signed-off-by: Linus Torvalds --- .mailmap | 2 ++ Documentation/ABI/testing/sysfs-power | 2 +- MAINTAINERS | 6 +++--- drivers/i2c/busses/i2c-img-scb.c | 2 +- drivers/media/rc/ir-sharp-decoder.c | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.mailmap b/.mailmap index 5273cfd70ad6..c7b10caecc4e 100644 --- a/.mailmap +++ b/.mailmap @@ -68,6 +68,8 @@ Jacob Shin James Bottomley James Bottomley James E Wilson +James Hogan +James Hogan James Ketrenos Javi Merino diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 713cab1d5f12..a1d1612f3651 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -127,7 +127,7 @@ Description: What; /sys/power/pm_trace_dev_match Date: October 2010 -Contact: James Hogan +Contact: James Hogan Description: The /sys/power/pm_trace_dev_match file contains the name of the device associated with the last PM event point saved in the RTC diff --git a/MAINTAINERS b/MAINTAINERS index 65b0c88d5ee0..3f05fc6961ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6738,7 +6738,7 @@ F: Documentation/devicetree/bindings/auxdisplay/img-ascii-lcd.txt F: drivers/auxdisplay/img-ascii-lcd.c IMGTEC IR DECODER DRIVER -M: James Hogan +M: James Hogan S: Maintained F: drivers/media/rc/img-ir/ @@ -7562,7 +7562,7 @@ F: arch/arm64/include/asm/kvm* F: arch/arm64/kvm/ KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips) -M: James Hogan +M: James Hogan L: linux-mips@linux-mips.org S: Supported F: arch/mips/include/uapi/asm/kvm* @@ -8885,7 +8885,7 @@ F: Documentation/devicetree/bindings/media/meson-ao-cec.txt T: git git://linuxtv.org/media_tree.git METAG ARCHITECTURE -M: James Hogan +M: James Hogan L: linux-metag@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/jhogan/metag.git S: Odd Fixes diff --git a/drivers/i2c/busses/i2c-img-scb.c b/drivers/i2c/busses/i2c-img-scb.c index 84fb35f6837f..eb1d91b986fd 100644 --- a/drivers/i2c/busses/i2c-img-scb.c +++ b/drivers/i2c/busses/i2c-img-scb.c @@ -1459,6 +1459,6 @@ static struct platform_driver img_scb_i2c_driver = { }; module_platform_driver(img_scb_i2c_driver); -MODULE_AUTHOR("James Hogan "); +MODULE_AUTHOR("James Hogan "); MODULE_DESCRIPTION("IMG host I2C driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index ed43a4212479..129b558acc92 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -245,5 +245,5 @@ module_init(ir_sharp_decode_init); module_exit(ir_sharp_decode_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Hogan "); +MODULE_AUTHOR("James Hogan "); MODULE_DESCRIPTION("Sharp IR protocol decoder"); -- cgit From e769fcec6bc4bdd1b0e2cf817680148f9c40b1c4 Mon Sep 17 00:00:00 2001 From: Vishakha Narvekar Date: Tue, 3 Oct 2017 16:13:29 -0400 Subject: net: 8021q: skip packets if the vlan is down If the vlan is down, free the packet instead of proceeding with other processing, or counting it as received. If vlan interfaces are used as slaves for bonding, with arp monitoring for connectivity, if the rx counter is seen to be incrementing, then the bond device will not observe that the interface is down. CC: David S. Miller Signed-off-by: Vishakha Narvekar Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e2ed69850489..0bc31de9071a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -21,6 +21,12 @@ bool vlan_do_receive(struct sk_buff **skbp) if (unlikely(!skb)) return false; + if (unlikely(!(vlan_dev->flags & IFF_UP))) { + kfree_skb(skb); + *skbp = NULL; + return false; + } + skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. -- cgit From 463a9215f3ca7600b5fff6d903913906ae4548a9 Mon Sep 17 00:00:00 2001 From: Pierre-Yves MORDRET Date: Thu, 21 Sep 2017 15:30:09 +0200 Subject: i2c: stm32f7: fix setup structure I2C drive setup structure is not properly allocated. Make it static instead of pointer to store driver data. Fixes: aeb068c5721485 ("i2c: i2c-stm32f7: add driver") Signed-off-by: Pierre-Yves MORDRET Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 47c67b0ca896..42ebdb1665e1 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -215,7 +215,7 @@ struct stm32f7_i2c_dev { unsigned int msg_num; unsigned int msg_id; struct stm32f7_i2c_msg f7_msg; - struct stm32f7_i2c_setup *setup; + struct stm32f7_i2c_setup setup; struct stm32f7_i2c_timings timing; }; @@ -537,7 +537,7 @@ static void stm32f7_i2c_hw_config(struct stm32f7_i2c_dev *i2c_dev) writel_relaxed(timing, i2c_dev->base + STM32F7_I2C_TIMINGR); /* Enable I2C */ - if (i2c_dev->setup->analog_filter) + if (i2c_dev->setup.analog_filter) stm32f7_i2c_clr_bits(i2c_dev->base + STM32F7_I2C_CR1, STM32F7_I2C_CR1_ANFOFF); else @@ -887,22 +887,19 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) } setup = of_device_get_match_data(&pdev->dev); - i2c_dev->setup->rise_time = setup->rise_time; - i2c_dev->setup->fall_time = setup->fall_time; - i2c_dev->setup->dnf = setup->dnf; - i2c_dev->setup->analog_filter = setup->analog_filter; + i2c_dev->setup = *setup; ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-rising-time-ns", &rise_time); if (!ret) - i2c_dev->setup->rise_time = rise_time; + i2c_dev->setup.rise_time = rise_time; ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-falling-time-ns", &fall_time); if (!ret) - i2c_dev->setup->fall_time = fall_time; + i2c_dev->setup.fall_time = fall_time; - ret = stm32f7_i2c_setup_timing(i2c_dev, i2c_dev->setup); + ret = stm32f7_i2c_setup_timing(i2c_dev, &i2c_dev->setup); if (ret) goto clk_free; -- cgit From cb09d943c70da7c8097006db1dc163b2d99338f6 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 21 Sep 2017 16:23:16 +0300 Subject: i2c: i801: Add support for Intel Cedar Fork Add PCI ID for Intel Cedar Fork PCH. Signed-off-by: Jarkko Nikula Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- Documentation/i2c/busses/i2c-i801 | 1 + drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-i801.c | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801 index 0500193434cb..d47702456926 100644 --- a/Documentation/i2c/busses/i2c-i801 +++ b/Documentation/i2c/busses/i2c-i801 @@ -36,6 +36,7 @@ Supported adapters: * Intel Gemini Lake (SOC) * Intel Cannon Lake-H (PCH) * Intel Cannon Lake-LP (PCH) + * Intel Cedar Fork (PCH) Datasheets: Publicly available at the Intel website On Intel Patsburg and later chipsets, both the normal host SMBus controller diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index c06dce2c1da7..45a3f3ca29b3 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -131,6 +131,7 @@ config I2C_I801 Gemini Lake (SOC) Cannon Lake-H (PCH) Cannon Lake-LP (PCH) + Cedar Fork (PCH) This driver can also be built as a module. If so, the module will be called i2c-i801. diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index e114e4e00d29..9e12a53ef7b8 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -68,6 +68,7 @@ * Gemini Lake (SOC) 0x31d4 32 hard yes yes yes * Cannon Lake-H (PCH) 0xa323 32 hard yes yes yes * Cannon Lake-LP (PCH) 0x9da3 32 hard yes yes yes + * Cedar Fork (PCH) 0x18df 32 hard yes yes yes * * Features supported by this driver: * Software PEC no @@ -204,6 +205,7 @@ /* Older devices have their ID defined in */ #define PCI_DEVICE_ID_INTEL_BAYTRAIL_SMBUS 0x0f12 +#define PCI_DEVICE_ID_INTEL_CDF_SMBUS 0x18df #define PCI_DEVICE_ID_INTEL_DNV_SMBUS 0x19df #define PCI_DEVICE_ID_INTEL_COUGARPOINT_SMBUS 0x1c22 #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS 0x1d22 @@ -1025,6 +1027,7 @@ static const struct pci_device_id i801_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BRASWELL_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CDF_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_DNV_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BROXTON_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS) }, @@ -1513,6 +1516,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: + case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: priv->features |= FEATURE_I2C_BLOCK_READ; -- cgit From a91aee523fb282f8868ba6b918f781679c3ea301 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 08:24:27 +0200 Subject: i2c: ensure termination of *_device_id tables Make sure (of/i2c/platform)_device_id tables are NULL terminated. Found by coccinelle spatch "misc/of_table.cocci" Signed-off-by: Thomas Meyer Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sprd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 22e08ae1704f..25fcc3c1e32b 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -627,6 +627,7 @@ static const struct dev_pm_ops sprd_i2c_pm_ops = { static const struct of_device_id sprd_i2c_of_match[] = { { .compatible = "sprd,sc9860-i2c", }, + {}, }; static struct platform_driver sprd_i2c_driver = { -- cgit From 25f2f440989c7079fdd8fccd54592cc077b63ae5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 09:15:39 +0100 Subject: i2c: i2c-stm32f7: make structure stm32f7_setup static const The structure stm32f7_setup is local to the source and does not need to be in global scope, make it static const. Cleans up sparse warning: symbol 'stm32f7_setup' was not declared. Should it be static? Signed-off-by: Colin Ian King Acked-by: Pierre-Yves MORDRET Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 42ebdb1665e1..d4a6e9c2e9aa 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -265,7 +265,7 @@ static struct stm32f7_i2c_spec i2c_specs[] = { }, }; -struct stm32f7_i2c_setup stm32f7_setup = { +static const struct stm32f7_i2c_setup stm32f7_setup = { .rise_time = STM32F7_I2C_RISE_TIME_DEFAULT, .fall_time = STM32F7_I2C_FALL_TIME_DEFAULT, .dnf = STM32F7_I2C_DNF_DEFAULT, -- cgit From f26e60167d8b5b1c67b3efd4cb5672da446bdb0e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 4 Oct 2017 10:39:05 -0500 Subject: x86/kvm: Move kvm_fastop_exception to .fixup section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling the kernel with the '-frecord-gcc-switches' flag, objtool complains: arch/x86/kvm/emulate.o: warning: objtool: .GCC.command.line+0x0: special: can't find new instruction And also the kernel fails to link. The problem is that the 'kvm_fastop_exception' code gets placed into the throwaway '.GCC.command.line' section instead of '.text'. Exception fixup code is conventionally placed in the '.fixup' section, so put it there where it belongs. Reported-and-tested-by: Guenter Roeck Signed-off-by: Josh Poimboeuf Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/emulate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a36254cbf776..d90cdc77e077 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #op " %al \n\t" \ FOP_RET -asm(".global kvm_fastop_exception \n" - "kvm_fastop_exception: xor %esi, %esi; ret"); +asm(".pushsection .fixup, \"ax\"\n" + ".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret\n" + ".popsection"); FOP_START(setcc) FOP_SETCC(seto) -- cgit From 954c736f865d6c0c68ae4263a2f3502ee7c447a3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 18 Sep 2017 22:53:43 +0300 Subject: ovl: fix may_write_real() for overlayfs directories Overlayfs directory file_inode() is the overlay inode whether the real inode is upper or lower. This fixes a regression in xfstest generic/158. Fixes: 7c6893e3c9ab ("ovl: don't allow writing ioctl on lower layer") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 54059b142d6b..3b601f115b6c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -468,7 +468,9 @@ static inline int may_write_real(struct file *file) /* File refers to upper, writable layer? */ upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); - if (upperdentry && file_inode(file) == d_inode(upperdentry)) + if (upperdentry && + (file_inode(file) == d_inode(upperdentry) || + file_inode(file) == d_inode(dentry))) return 0; /* Lower layer: can't write to real file, sorry... */ -- cgit From e0082a0f04c432cb6d7128ef60d8e425e45ce025 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 24 Sep 2017 13:01:35 +0300 Subject: ovl: fix error value printed in ovl_lookup_index() Fixes: 359f392ca53e ("ovl: lookup index entry for copy up origin") Cc: # v4.13 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c3addd1114f1..654bea1a5ac9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -506,6 +506,7 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { + err = PTR_ERR(index); pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, -- cgit From 9f4ec904dbd4eb1a2db10d5e7dc16eae386fe64d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 24 Sep 2017 17:36:26 +0300 Subject: ovl: fix dput() of ERR_PTR in ovl_cleanup_index() Fixes: caf70cb2ba5d ("ovl: cleanup orphan index entries") Cc: # v4.13 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/util.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 117794582f9f..7ae33d225a67 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -430,7 +430,7 @@ void ovl_inuse_unlock(struct dentry *dentry) } } -/* Called must hold OVL_I(inode)->oi_lock */ +/* Caller must hold OVL_I(inode)->lock */ static void ovl_cleanup_index(struct dentry *dentry) { struct inode *dir = ovl_indexdir(dentry->d_sb)->d_inode; @@ -469,6 +469,9 @@ static void ovl_cleanup_index(struct dentry *dentry) err = PTR_ERR(index); if (!IS_ERR(index)) err = ovl_cleanup(dir, index); + else + index = NULL; + inode_unlock(dir); if (err) goto fail; -- cgit From dc7ab6773e8171e07f16fd0df0c5eea28c899503 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 24 Sep 2017 22:19:10 +0300 Subject: ovl: fix dentry leak in ovl_indexdir_cleanup() index dentry was not released when breaking out of the loop due to index verification error. Fixes: 415543d5c64f ("ovl: cleanup bad and stale index entries on mount") Cc: # v4.13 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 62e9b22a2077..0f85ee9c3268 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -988,6 +988,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, struct path *lowerstack, unsigned int numlower) { int err; + struct dentry *index = NULL; struct inode *dir = dentry->d_inode; struct path path = { .mnt = mnt, .dentry = dentry }; LIST_HEAD(list); @@ -1007,8 +1008,6 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { - struct dentry *index; - if (p->name[0] == '.') { if (p->len == 1) continue; @@ -1018,6 +1017,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, index = lookup_one_len(p->name, dentry, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); + index = NULL; break; } err = ovl_verify_index(index, lowerstack, numlower); @@ -1029,7 +1029,9 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, break; } dput(index); + index = NULL; } + dput(index); inode_unlock(dir); out: ovl_cache_free(&list); -- cgit From 5820dc0888d302ac05f8b91ffdf7e4e53b4fbf53 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 25 Sep 2017 16:39:55 +0300 Subject: ovl: fix missing unlock_rename() in ovl_do_copy_up() Use the ovl_lock_rename_workdir() helper which requires unlock_rename() only on lock success. Fixes: ("fd210b7d67ee ovl: move copy up lock out") Cc: # v4.13 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 6 ++---- fs/overlayfs/dir.c | 20 -------------------- fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/util.c | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index aad97b30d5e6..c441f9387a1b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -561,10 +561,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) c->tmpfile = true; err = ovl_copy_up_locked(c); } else { - err = -EIO; - if (lock_rename(c->workdir, c->destdir) != NULL) { - pr_err("overlayfs: failed to lock workdir+upperdir\n"); - } else { + err = ovl_lock_rename_workdir(c->workdir, c->destdir); + if (!err) { err = ovl_copy_up_locked(c); unlock_rename(c->workdir, c->destdir); } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 3309b1912241..cc961a3bd3bd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -216,26 +216,6 @@ out_unlock: return err; } -static int ovl_lock_rename_workdir(struct dentry *workdir, - struct dentry *upperdir) -{ - /* Workdir should not be the same as upperdir */ - if (workdir == upperdir) - goto err; - - /* Workdir should not be subdir of upperdir and vice versa */ - if (lock_rename(workdir, upperdir) != NULL) - goto err_unlock; - - return 0; - -err_unlock: - unlock_rename(workdir, upperdir); -err: - pr_err("overlayfs: failed to lock workdir+upperdir\n"); - return -EIO; -} - static struct dentry *ovl_clear_empty(struct dentry *dentry, struct list_head *list) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d4e8c1a08fb0..c706a6f99928 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -235,6 +235,7 @@ bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry, bool *locked); void ovl_nlink_end(struct dentry *dentry, bool locked); +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7ae33d225a67..b9b239fa5cfd 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -560,3 +560,22 @@ void ovl_nlink_end(struct dentry *dentry, bool locked) mutex_unlock(&OVL_I(d_inode(dentry))->lock); } } + +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} -- cgit From 85fdee1eef1a9e48ad5716916677e0c5fbc781e3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 29 Sep 2017 10:21:21 +0300 Subject: ovl: fix regression caused by exclusive upper/work dir protection Enforcing exclusive ownership on upper/work dirs caused a docker regression: https://github.com/moby/moby/issues/34672. Euan spotted the regression and pointed to the offending commit. Vivek has brought the regression to my attention and provided this reproducer: Terminal 1: mount -t overlay -o workdir=work,lowerdir=lower,upperdir=upper none merged/ Terminal 2: unshare -m Terminal 1: umount merged mount -t overlay -o workdir=work,lowerdir=lower,upperdir=upper none merged/ mount: /root/overlay-testing/merged: none already mounted or mount point busy To fix the regression, I replaced the error with an alarming warning. With index feature enabled, mount does fail, but logs a suggestion to override exclusive dir protection by disabling index. Note that index=off mount does take the inuse locks, so a concurrent index=off will issue the warning and a concurrent index=on mount will fail. Documentation was updated to reflect this change. Fixes: 2cac0c00a6cd ("ovl: get exclusive ownership on upper/work dirs") Cc: # v4.13 Reported-by: Euan Kemp Reported-by: Vivek Goyal Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.txt | 5 ++++- fs/overlayfs/ovl_entry.h | 3 +++ fs/overlayfs/super.c | 27 +++++++++++++++++++-------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 36f528a7fdd6..8caa60734647 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -210,8 +210,11 @@ path as another overlay mount and it may use a lower layer path that is beneath or above the path of another overlay lower layer path. Using an upper layer path and/or a workdir path that are already used by -another overlay mount is not allowed and will fail with EBUSY. Using +another overlay mount is not allowed and may fail with EBUSY. Using partially overlapping paths is not allowed but will not fail with EBUSY. +If files are accessed from two overlayfs mounts which share or overlap the +upper layer and/or workdir path the behavior of the overlay is undefined, +though it will not result in a crash or deadlock. Mounting an overlay using an upper layer path, where the upper layer path was previously used by another mounted overlay in combination with a diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 878a750986dd..25d9b5adcd42 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -37,6 +37,9 @@ struct ovl_fs { bool noxattr; /* sb common to all layers */ struct super_block *same_sb; + /* Did we take the inuse lock? */ + bool upperdir_locked; + bool workdir_locked; }; /* private information held for every overlayfs dentry */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index fd5ea4facc62..092d150643c1 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -211,9 +211,10 @@ static void ovl_put_super(struct super_block *sb) dput(ufs->indexdir); dput(ufs->workdir); - ovl_inuse_unlock(ufs->workbasedir); + if (ufs->workdir_locked) + ovl_inuse_unlock(ufs->workbasedir); dput(ufs->workbasedir); - if (ufs->upper_mnt) + if (ufs->upper_mnt && ufs->upperdir_locked) ovl_inuse_unlock(ufs->upper_mnt->mnt_root); mntput(ufs->upper_mnt); for (i = 0; i < ufs->numlower; i++) @@ -881,9 +882,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_upperpath; err = -EBUSY; - if (!ovl_inuse_trylock(upperpath.dentry)) { - pr_err("overlayfs: upperdir is in-use by another mount\n"); + if (ovl_inuse_trylock(upperpath.dentry)) { + ufs->upperdir_locked = true; + } else if (ufs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); goto out_put_upperpath; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } err = ovl_mount_dir(ufs->config.workdir, &workpath); @@ -901,9 +906,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } err = -EBUSY; - if (!ovl_inuse_trylock(workpath.dentry)) { - pr_err("overlayfs: workdir is in-use by another mount\n"); + if (ovl_inuse_trylock(workpath.dentry)) { + ufs->workdir_locked = true; + } else if (ufs->config.index) { + pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); goto out_put_workpath; + } else { + pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } ufs->workbasedir = workpath.dentry; @@ -1156,11 +1165,13 @@ out_put_lowerpath: out_free_lowertmp: kfree(lowertmp); out_unlock_workdentry: - ovl_inuse_unlock(workpath.dentry); + if (ufs->workdir_locked) + ovl_inuse_unlock(workpath.dentry); out_put_workpath: path_put(&workpath); out_unlock_upperdentry: - ovl_inuse_unlock(upperpath.dentry); + if (ufs->upperdir_locked) + ovl_inuse_unlock(upperpath.dentry); out_put_upperpath: path_put(&upperpath); out_free_config: -- cgit From e42eef4ba38806b18c4a74f0c276fb2e0b548173 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Oct 2017 12:28:18 +0200 Subject: KVM: add X86_LOCAL_APIC dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rework of the posted interrupt handling broke building without support for the local APIC: ERROR: "boot_cpu_physical_apicid" [arch/x86/kvm/kvm-intel.ko] undefined! That configuration is probably not particularly useful anyway, so we can avoid the randconfig failures by adding a Kconfig dependency. Fixes: 8b306e2f3c41 ("KVM: VMX: avoid double list add with VT-d posted interrupts") Signed-off-by: Arnd Bergmann Signed-off-by: Radim Krčmář --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3ea624452f93..3c48bc8bf08c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: depends on NET && MULTIUSER + depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES -- cgit From 41dcf197ad5373a7dd0a4b6572aec2e3ec6a0e49 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Mon, 2 Oct 2017 17:17:35 -0500 Subject: dm raid: fix incorrect status output at the end of a "recover" process There are three important fields that indicate the overall health and status of an array: dev_health, sync_ratio, and sync_action. They tell us the condition of the devices in the array, and the degree to which the array is synchronized. This commit fixes a condition that is reported incorrectly. When a member of the array is being rebuilt or a new device is added, the "recover" process is used to synchronize it with the rest of the array. When the process is complete, but the sync thread hasn't yet been reaped, it is possible for the state of MD to be: mddev->recovery = [ MD_RECOVERY_RUNNING MD_RECOVERY_RECOVER MD_RECOVERY_DONE ] curr_resync_completed = (but not MaxSector) and all rdevs to be In_sync. This causes the 'array_in_sync' output parameter that is passed to rs_get_progress() to be computed incorrectly and reported as 'false' -- or not in-sync. This in turn causes the dev_health status characters to be reported as all 'a', rather than the proper 'A'. This can cause erroneous output for several seconds at a time when tools will want to be checking the condition due to events that are raised at the end of a sync process. Fix this by properly calculating the 'array_in_sync' return parameter in rs_get_progress(). Also, remove an unnecessary intermediate 'recovery_cp' variable in rs_get_progress(). Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 1 + drivers/md/dm-raid.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 4a0a7469fdd7..32df07e29f68 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -344,3 +344,4 @@ Version History (wrong raid10_copies/raid10_format sequence) 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option 1.12.1 fix for MD deadlock between mddev_suspend() and md_write_start() available +1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5bfe285ea9d1..43094ea89e37 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3297,11 +3297,10 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, static sector_t rs_get_progress(struct raid_set *rs, sector_t resync_max_sectors, bool *array_in_sync) { - sector_t r, recovery_cp, curr_resync_completed; + sector_t r, curr_resync_completed; struct mddev *mddev = &rs->md; curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - recovery_cp = mddev->recovery_cp; *array_in_sync = false; if (rs_is_raid0(rs)) { @@ -3330,9 +3329,11 @@ static sector_t rs_get_progress(struct raid_set *rs, } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) r = curr_resync_completed; else - r = recovery_cp; + r = mddev->recovery_cp; - if (r == MaxSector) { + if ((r == MaxSector) || + (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + (mddev->curr_resync_completed == resync_max_sectors))) { /* * Sync complete. */ @@ -3892,7 +3893,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 12, 1}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit From 53ecde0b9126ff140abe3aefd7f0ec64d6fa36b0 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 7 Sep 2017 15:05:51 +1000 Subject: powerpc/powernv: Increase memory block size to 1GB on radix Memory hot unplug on PowerNV radix hosts is broken. Our memory block size is 256MB but since we map the linear region with very large pages, each pte we tear down maps 1GB. A hot unplug of one 256MB memory block results in 768MB of memory getting unintentionally unmapped. At this point we are likely to oops. Fix this by increasing our memory block size to 1GB on PowerNV radix hosts. Fixes: 4b5d62ca17a1 ("powerpc/mm: add radix__remove_section_mapping()") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 897aa1400eb8..bbb73aa0eb8f 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -272,7 +272,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static unsigned long pnv_memory_block_size(void) { - return 256UL * 1024 * 1024; + /* + * We map the kernel linear region with 1GB large pages on radix. For + * memory hot unplug to work our memory block size must be at least + * this size. + */ + if (radix_enabled()) + return 1UL * 1024 * 1024 * 1024; + else + return 256UL * 1024 * 1024; } #endif -- cgit From c0d8832e78cbfd4a64b7112e34920af4b0b0e60e Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Fri, 6 Oct 2017 14:16:52 +0100 Subject: arm64: Ensure the instruction emulation is ready for userspace We trap and emulate some instructions (e.g, mrs, deprecated instructions) for the userspace. However the handlers for these are registered as late_initcalls and the userspace could be up and running from the initramfs by that time (with populate_rootfs, which is a rootfs_initcall()). This could cause problems for the early applications ending up in failure like : [ 11.152061] modprobe[93]: undefined instruction: pc=0000ffff8ca48ff4 This patch promotes the specific calls to core_initcalls, which are guaranteed to be completed before we hit userspace. Cc: stable@vger.kernel.org Cc: Dave Martin Cc: Matthias Brugger Cc: James Morse Reported-by: Matwey V. Kornilov Signed-off-by: Suzuki K Poulose Signed-off-by: Catalin Marinas --- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/cpufeature.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index f0e6d717885b..d06fbe4cd38d 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -649,4 +649,4 @@ static int __init armv8_deprecated_init(void) return 0; } -late_initcall(armv8_deprecated_init); +core_initcall(armv8_deprecated_init); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index cd52d365d1f0..21e2c95d24e7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1307,4 +1307,4 @@ static int __init enable_mrs_emulation(void) return 0; } -late_initcall(enable_mrs_emulation); +core_initcall(enable_mrs_emulation); -- cgit From ae2e972dae3cea795e9f8f94eb1601213c2d49f0 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Fri, 6 Oct 2017 14:16:53 +0100 Subject: arm64: Ensure fpsimd support is ready before userspace is active We register the pm/hotplug callbacks for FPSIMD as late_initcall, which happens after the userspace is active (from initramfs via populate_rootfs, a rootfs_initcall). Make sure we are ready even before the userspace could potentially use it, by promoting to a core_initcall. Cc: Will Deacon Cc: Mark Rutland Cc: Dave Martin Signed-off-by: Suzuki K Poulose Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index f444f374bd7b..5d547deb6996 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -444,4 +444,4 @@ static int __init fpsimd_init(void) return 0; } -late_initcall(fpsimd_init); +core_initcall(fpsimd_init); -- cgit From ab8eb7db1dcc03e7ae9eb379884fd701af6b1a38 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Fri, 22 Sep 2017 19:49:11 +0300 Subject: ARC: [plat-hsdk]: Add reset controller node to manage ethernet reset DW ethernet controller on HSDK hangs sometimes after SW reset, so add reset node to make possible to reset DW ethernet controller HW. Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 9 +++++++++ arch/arc/configs/hsdk_defconfig | 1 + 2 files changed, 10 insertions(+) diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index b922f3faf554..8adde1b492f1 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -12,6 +12,7 @@ /dts-v1/; #include +#include / { model = "snps,hsdk"; @@ -102,6 +103,12 @@ ranges = <0x00000000 0xf0000000 0x10000000>; + cgu_rst: reset-controller@8a0 { + compatible = "snps,hsdk-reset"; + #reset-cells = <1>; + reg = <0x8A0 0x4>, <0xFF0 0x4>; + }; + core_clk: core-clk@0 { compatible = "snps,hsdk-core-pll-clock"; reg = <0x00 0x10>, <0x14B8 0x4>; @@ -158,6 +165,8 @@ clocks = <&gmacclk>; clock-names = "stmmaceth"; phy-handle = <&phy0>; + resets = <&cgu_rst HSDK_ETH_RESET>; + reset-names = "stmmaceth"; mdio { #address-cells = <1>; diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index 7b8f8faf8a24..15f0f6b5fec1 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -63,6 +63,7 @@ CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_DW=y # CONFIG_IOMMU_SUPPORT is not set +CONFIG_RESET_HSDK=y CONFIG_EXT3_FS=y CONFIG_VFAT_FS=y CONFIG_TMPFS=y -- cgit From 8a5776a5f49812d29fe4b2d0a2d71675c3facf3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 Oct 2017 20:53:29 -0700 Subject: Linux 4.14-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cf007a31d575..2835863bdd5a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Fearless Coyote # *DOCUMENTATION* -- cgit From 592e254502041f953e84d091eae2c68cba04c10b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 3 Nov 2017 12:21:21 +0100 Subject: mm: Handle 0 flags in _calc_vm_trans() macro _calc_vm_trans() does not handle the situation when some of the passed flags are 0 (which can happen if these VM flags do not make sense for the architecture). Improve the _calc_vm_trans() macro to return 0 in such situation. Since all passed flags are constant, this does not add any runtime overhead. Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/mman.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index c8367041fafd..edb6cf6a81ed 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -63,8 +63,9 @@ static inline bool arch_validate_prot(unsigned long prot) * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ + ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ - : ((x) & (bit1)) / ((bit1) / (bit2))) + : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. -- cgit From 1c9725974074a047f6080eecc62c50a8e840d050 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 1 Nov 2017 16:36:30 +0100 Subject: mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap flags The mmap(2) syscall suffers from the ABI anti-pattern of not validating unknown flags. However, proposals like MAP_SYNC need a mechanism to define new behavior that is known to fail on older kernels without the support. Define a new MAP_SHARED_VALIDATE flag pattern that is guaranteed to fail on all legacy mmap implementations. It is worth noting that the original proposal was for a standalone MAP_VALIDATE flag. However, when that could not be supported by all archs Linus observed: I see why you *think* you want a bitmap. You think you want a bitmap because you want to make MAP_VALIDATE be part of MAP_SYNC etc, so that people can do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_SYNC, fd, 0); and "know" that MAP_SYNC actually takes. And I'm saying that whole wish is bogus. You're fundamentally depending on special semantics, just make it explicit. It's already not portable, so don't try to make it so. Rename that MAP_VALIDATE as MAP_SHARED_VALIDATE, make it have a value of 0x3, and make people do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0); and then the kernel side is easier too (none of that random garbage playing games with looking at the "MAP_VALIDATE bit", but just another case statement in that map type thing. Boom. Done. Similar to ->fallocate() we also want the ability to validate the support for new flags on a per ->mmap() 'struct file_operations' instance basis. Towards that end arrange for flags to be generically validated against a mmap_supported_flags exported by 'struct file_operations'. By default all existing flags are implicitly supported, but new flags require MAP_SHARED_VALIDATE and per-instance-opt-in. Cc: Jan Kara Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Andrew Morton Suggested-by: Christoph Hellwig Suggested-by: Linus Torvalds Reviewed-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + include/linux/fs.h | 1 + include/linux/mman.h | 39 ++++++++++++++++++++++++++++ include/uapi/asm-generic/mman-common.h | 1 + mm/mmap.c | 15 +++++++++++ tools/include/uapi/asm-generic/mman-common.h | 1 + 9 files changed, 61 insertions(+) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 3b26cc62dadb..f6d118aaedb9 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -11,6 +11,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping (OSF/1 is _wrong_) */ #define MAP_FIXED 0x100 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index da3216007fe0..93268e4cd3c7 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -28,6 +28,7 @@ */ #define MAP_SHARED 0x001 /* Share changes */ #define MAP_PRIVATE 0x002 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 775b5d5e41a1..bca652aa1677 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -11,6 +11,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x03 /* Mask for type of mapping */ #define MAP_FIXED 0x04 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index b15b278aa314..9ab426374714 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -35,6 +35,7 @@ */ #define MAP_SHARED 0x001 /* Share changes */ #define MAP_PRIVATE 0x002 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 13dab191a23e..57added3201d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1701,6 +1701,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); + unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); diff --git a/include/linux/mman.h b/include/linux/mman.h index edb6cf6a81ed..74452e3f2536 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -7,6 +7,45 @@ #include #include +/* + * Arrange for legacy / undefined architecture specific flags to be + * ignored by default in LEGACY_MAP_MASK. + */ +#ifndef MAP_32BIT +#define MAP_32BIT 0 +#endif +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB 0 +#endif +#ifndef MAP_HUGE_1GB +#define MAP_HUGE_1GB 0 +#endif +#ifndef MAP_UNINITIALIZED +#define MAP_UNINITIALIZED 0 +#endif + +/* + * The historical set of flags that all mmap implementations implicitly + * support when a ->mmap_validate() op is not provided in file_operations. + */ +#define LEGACY_MAP_MASK (MAP_SHARED \ + | MAP_PRIVATE \ + | MAP_FIXED \ + | MAP_ANONYMOUS \ + | MAP_DENYWRITE \ + | MAP_EXECUTABLE \ + | MAP_UNINITIALIZED \ + | MAP_GROWSDOWN \ + | MAP_LOCKED \ + | MAP_NORESERVE \ + | MAP_POPULATE \ + | MAP_NONBLOCK \ + | MAP_STACK \ + | MAP_HUGETLB \ + | MAP_32BIT \ + | MAP_HUGE_2MB \ + | MAP_HUGE_1GB) + extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 203268f9231e..8ce7f5a0800f 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -16,6 +16,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ diff --git a/mm/mmap.c b/mm/mmap.c index 680506faceae..924839fac0e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned long flags_mask; + + flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Force use of MAP_SHARED_VALIDATE with non-legacy + * flags. E.g. MAP_SYNC is dangerous to use with + * MAP_SHARED as you don't know which consistency model + * you will get. We silently ignore unsupported flags + * with MAP_SHARED to preserve backward compatibility. + */ + flags &= LEGACY_MAP_MASK; + /* fall through */ + case MAP_SHARED_VALIDATE: + if (flags & ~flags_mask) + return -EOPNOTSUPP; if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) return -EACCES; diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 203268f9231e..8ce7f5a0800f 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -16,6 +16,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ -- cgit From d81b8a722f691a3907eb4a6df410ab517ba5bfcc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:31 +0100 Subject: mm: Remove VM_FAULT_HWPOISON_LARGE_MASK It is unused. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 065d99deb847..ca72b67153d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1182,8 +1182,6 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ -#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ - #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_FALLBACK) -- cgit From 31a6f1a6e5a4a26040b67d8fa4256539b36f5893 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:32 +0100 Subject: dax: Simplify arguments of dax_insert_mapping() dax_insert_mapping() has lots of arguments and a lot of them is actuall duplicated by passing vm_fault structure as well. Change the function to take the same arguments as dax_pmd_insert_mapping(). Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index f001d8c72a06..0bc42ac294ca 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -820,23 +820,30 @@ out: } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void *entry, - struct vm_area_struct *vma, struct vm_fault *vmf) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) +{ + return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); +} + +static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void *entry) { + const sector_t sector = dax_iomap_sector(iomap, pos); + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; unsigned long vaddr = vmf->address; void *ret, *kaddr; pgoff_t pgoff; int id, rc; pfn_t pfn; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); if (rc) return rc; id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + rc = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), + &kaddr, &pfn); if (rc < 0) { dax_read_unlock(id); return rc; @@ -936,11 +943,6 @@ int __dax_zero_page_range(struct block_device *bdev, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) -{ - return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); -} - static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) @@ -1087,7 +1089,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; @@ -1140,9 +1141,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto error_finish_iomap; } - sector = dax_iomap_sector(&iomap, pos); - if (vmf->cow_page) { + sector_t sector = dax_iomap_sector(&iomap, pos); + switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: @@ -1175,8 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, entry, vmf->vma, vmf); + error = dax_insert_mapping(vmf, &iomap, pos, entry); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; -- cgit From 5e161e4066d3ebeaff95a4b979b42f8bf00494d5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:33 +0100 Subject: dax: Factor out getting of pfn out of iomap Factor out code to get pfn out of iomap that is shared between PTE and PMD fault path. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 83 +++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 0bc42ac294ca..116eef8d6c69 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -825,30 +825,53 @@ static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } -static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) +static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, + pfn_t *pfnp) { const sector_t sector = dax_iomap_sector(iomap, pos); - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long vaddr = vmf->address; - void *ret, *kaddr; pgoff_t pgoff; + void *kaddr; int id, rc; - pfn_t pfn; + long length; - rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); if (rc) return rc; - id = dax_read_lock(); - rc = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), - &kaddr, &pfn); - if (rc < 0) { - dax_read_unlock(id); - return rc; + length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), + &kaddr, pfnp); + if (length < 0) { + rc = length; + goto out; } + rc = -EINVAL; + if (PFN_PHYS(length) < size) + goto out; + if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + goto out; + /* For larger pages we need devmap */ + if (length > 1 && !pfn_t_devmap(*pfnp)) + goto out; + rc = 0; +out: dax_read_unlock(id); + return rc; +} + +static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void *entry) +{ + const sector_t sector = dax_iomap_sector(iomap, pos); + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long vaddr = vmf->address; + void *ret; + int rc; + pfn_t pfn; + + rc = dax_iomap_pfn(iomap, pos, PAGE_SIZE, &pfn); + if (rc < 0) + return rc; ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) @@ -1223,46 +1246,26 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); - struct dax_device *dax_dev = iomap->dax_dev; - struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; - const size_t size = PMD_SIZE; - void *ret = NULL, *kaddr; - long length = 0; - pgoff_t pgoff; + void *ret = NULL; pfn_t pfn = {}; - int id; + int rc; - if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) + rc = dax_iomap_pfn(iomap, pos, PMD_SIZE, &pfn); + if (rc < 0) goto fallback; - id = dax_read_lock(); - length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (length < 0) - goto unlock_fallback; - length = PFN_PHYS(length); - - if (length < size) - goto unlock_fallback; - if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) - goto unlock_fallback; - if (!pfn_t_devmap(pfn)) - goto unlock_fallback; - dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, vmf->flags & FAULT_FLAG_WRITE); -unlock_fallback: - dax_read_unlock(id); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, PMD_SIZE, pfn, ret); return VM_FAULT_FALLBACK; } -- cgit From a0987ad5c576626fdb40547f5ac99b4f90608dda Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:34 +0100 Subject: dax: Create local variable for VMA in dax_iomap_pte_fault() There are already two users and more are coming. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 116eef8d6c69..c09465884bbe 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1108,7 +1108,8 @@ static int dax_fault_return(int error) static int dax_iomap_pte_fault(struct vm_fault *vmf, const struct iomap_ops *ops) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1196,7 +1197,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(vmf, &iomap, pos, entry); -- cgit From d2c43ef13327478b299db0f58193495367c6d4ae Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:35 +0100 Subject: dax: Create local variable for vmf->flags & FAULT_FLAG_WRITE test There are already two users and more are coming. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index c09465884bbe..5ea71381dba0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1116,6 +1116,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; + bool write = vmf->flags & FAULT_FLAG_WRITE; int vmf_ret = 0; void *entry; @@ -1130,7 +1131,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto out; } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + if (write && !vmf->cow_page) flags |= IOMAP_WRITE; entry = grab_mapping_entry(mapping, vmf->pgoff, 0); @@ -1207,7 +1208,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) { + if (!write) { vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } -- cgit From 1b5a1cb21e0cdfb001050c76dc31039cdece1a63 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:36 +0100 Subject: dax: Inline dax_insert_mapping() into the callsite dax_insert_mapping() has only one callsite and we will need to further fine tune what it does for synchronous faults. Just inline it into the callsite so that we don't have to pass awkward bools around. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5ea71381dba0..5b20c6456926 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -858,32 +858,6 @@ out: return rc; } -static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - const sector_t sector = dax_iomap_sector(iomap, pos); - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long vaddr = vmf->address; - void *ret; - int rc; - pfn_t pfn; - - rc = dax_iomap_pfn(iomap, pos, PAGE_SIZE, &pfn); - if (rc < 0) - return rc; - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - trace_dax_insert_mapping(mapping->host, vmf, ret); - if (vmf->flags & FAULT_FLAG_WRITE) - return vm_insert_mixed_mkwrite(vma, vaddr, pfn); - else - return vm_insert_mixed(vma, vaddr, pfn); -} - /* * The user has performed a load from a hole in the file. Allocating a new * page in the file would cause excessive storage usage for workloads with @@ -1119,6 +1093,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, bool write = vmf->flags & FAULT_FLAG_WRITE; int vmf_ret = 0; void *entry; + pfn_t pfn; trace_dax_pte_fault(inode, vmf, vmf_ret); /* @@ -1201,7 +1176,24 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); + if (error < 0) + goto error_finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + 0); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto error_finish_iomap; + } + + trace_dax_insert_mapping(inode, vmf, entry); + if (write) + error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + error = vm_insert_mixed(vma, vaddr, pfn); + /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; -- cgit From 302a5e312b3a106fec49ba08da3f6545c9b7ee18 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:37 +0100 Subject: dax: Inline dax_pmd_insert_mapping() into the callsite dax_pmd_insert_mapping() has only one callsite and we will need to further fine tune what it does for synchronous faults. Just inline it into the callsite so that we don't have to pass awkward bools around. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 47 +++++++++++++++++-------------------------- include/trace/events/fs_dax.h | 1 - 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5b20c6456926..675fab8ec41f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1235,33 +1235,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - struct address_space *mapping = vmf->vma->vm_file->f_mapping; - const sector_t sector = dax_iomap_sector(iomap, pos); - struct inode *inode = mapping->host; - void *ret = NULL; - pfn_t pfn = {}; - int rc; - - rc = dax_iomap_pfn(iomap, pos, PMD_SIZE, &pfn); - if (rc < 0) - goto fallback; - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, - RADIX_DAX_PMD); - if (IS_ERR(ret)) - goto fallback; - - trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, ret); - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, vmf->flags & FAULT_FLAG_WRITE); - -fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, PMD_SIZE, pfn, ret); - return VM_FAULT_FALLBACK; -} +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) @@ -1317,6 +1295,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, void *entry; loff_t pos; int error; + pfn_t pfn; /* * Check whether offset isn't beyond end of file now. Caller is @@ -1394,7 +1373,19 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); + if (error < 0) + goto finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto finish_iomap; + + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); + result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index fbc4a06f7310..88a9d19b8ff8 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -148,7 +148,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ TP_ARGS(inode, vmf, length, pfn, radix_entry)) DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); -DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); DECLARE_EVENT_CLASS(dax_pte_fault_class, TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), -- cgit From cec04e8c825eaa5a4bb1a3ce0d3784628e965196 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:38 +0100 Subject: dax: Fix comment describing dax_iomap_fault() Add missing argument description. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 675fab8ec41f..5214ed9ba508 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1435,7 +1435,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @ops: iomap ops passed from the file system + * @pe_size: Size of the page to fault in + * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller -- cgit From 9a0dd42251439de635088b97533109a935864a84 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:39 +0100 Subject: dax: Allow dax_iomap_fault() to return pfn For synchronous page fault dax_iomap_fault() will need to return PFN which will then need to be inserted into page tables after fsync() completes. Add necessary parameter to dax_iomap_fault(). Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 13 +++++++------ fs/ext2/file.c | 2 +- fs/ext4/file.c | 2 +- fs/xfs/xfs_file.c | 4 ++-- include/linux/dax.h | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5214ed9ba508..5ddf15161390 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1079,7 +1079,7 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -static int dax_iomap_pte_fault(struct vm_fault *vmf, +static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1280,7 +1280,7 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1425,7 +1425,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1436,6 +1436,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault * @pe_size: Size of the page to fault in + * @pfnp: PFN to insert for synchronous faults if fsync is required * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in @@ -1444,13 +1445,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * successfully. */ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops) + pfn_t *pfnp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, ops); + return dax_iomap_pte_fault(vmf, pfnp, ops); case PE_SIZE_PMD: - return dax_iomap_pmd_fault(vmf, ops); + return dax_iomap_pmd_fault(vmf, pfnp, ops); default: return VM_FAULT_FALLBACK; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index ff3a3636a5ca..d2bb7c96307d 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1da660ac3bc..3cec0b95672f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -306,7 +306,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); } if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); else result = VM_FAULT_SIGBUS; if (write) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 309e26c9dddb..7c6b8def6eed 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1040,7 +1040,7 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1111,7 +1111,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 122197124b9d..e7fa4b8f45bc 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -95,7 +95,7 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops); + pfn_t *pfnp, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -- cgit From f5b7b74876cff5664ea8b2ef7f002c54cd6e7c90 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:40 +0100 Subject: dax: Allow tuning whether dax_insert_mapping_entry() dirties entry Currently we dirty radix tree entry whenever dax_insert_mapping_entry() gets called for a write fault. With synchronous page faults we would like to insert clean radix tree entry and dirty it only once we call fdatasync() and update page tables to save some unnecessary cache flushing. Add 'dirty' argument to dax_insert_mapping_entry() for that. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5ddf15161390..efc210ff6665 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, void *entry, sector_t sector, - unsigned long flags) + unsigned long flags, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; void *new_entry; pgoff_t index = vmf->pgoff; - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { @@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, entry = new_entry; } - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); @@ -881,7 +881,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_ZERO_PAGE); + RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; @@ -1182,7 +1182,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - 0); + 0, write); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto error_finish_iomap; @@ -1258,7 +1258,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1379,7 +1379,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - RADIX_DAX_PMD); + RADIX_DAX_PMD, write); if (IS_ERR(entry)) goto finish_iomap; -- cgit From b6fb293f2497a9841d94f6b57bd2bb2cd222da43 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:41 +0100 Subject: mm: Define MAP_SYNC and VM_SYNC flags Define new MAP_SYNC flag and corresponding VMA VM_SYNC flag. As the MAP_SYNC flag is not part of LEGACY_MAP_MASK, currently it will be refused by all MAP_SHARED_VALIDATE map attempts and silently ignored for everything else. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/proc/task_mmu.c | 1 + include/linux/mm.h | 1 + include/linux/mman.h | 8 ++++++-- include/uapi/asm-generic/mman.h | 1 + 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5589b4bd4b85..ea78b37deeaa 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -664,6 +664,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", diff --git a/include/linux/mm.h b/include/linux/mm.h index ca72b67153d5..5411cb7442de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -189,6 +189,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 74452e3f2536..3427bf3daef5 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,7 +9,7 @@ /* * Arrange for legacy / undefined architecture specific flags to be - * ignored by default in LEGACY_MAP_MASK. + * ignored by mmap handling code. */ #ifndef MAP_32BIT #define MAP_32BIT 0 @@ -23,6 +23,9 @@ #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif +#ifndef MAP_SYNC +#define MAP_SYNC 0 +#endif /* * The historical set of flags that all mmap implementations implicitly @@ -126,7 +129,8 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | + _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ); } unsigned long vm_commit_limit(void); diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 7162cd4cca73..00e55627d2df 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -12,6 +12,7 @@ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ +#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */ /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ -- cgit From caa51d26f85c248f1c4f43a870ad3ef84bf9eb8f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:42 +0100 Subject: dax, iomap: Add support for synchronous faults Add a flag to iomap interface informing the caller that inode needs fdstasync(2) for returned extent to become persistent and use it in DAX fault code so that we don't map such extents into page tables immediately. Instead we propagate the information that fdatasync(2) is necessary from dax_iomap_fault() with a new VM_FAULT_NEEDDSYNC flag. Filesystem fault handler is then responsible for calling fdatasync(2) and inserting pfn into page tables. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/iomap.h | 5 +++++ include/linux/mm.h | 6 +++++- 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index efc210ff6665..bb9ff907738c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1091,6 +1091,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unsigned flags = IOMAP_FAULT; int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; int vmf_ret = 0; void *entry; pfn_t pfn; @@ -1169,6 +1170,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto finish_iomap; } + sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { @@ -1182,12 +1185,27 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - 0, write); + 0, write && !sync); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto error_finish_iomap; } + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PTE into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) { + error = -EIO; + goto error_finish_iomap; + } + *pfnp = pfn; + vmf_ret = VM_FAULT_NEEDDSYNC | major; + goto finish_iomap; + } trace_dax_insert_mapping(inode, vmf, entry); if (write) error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); @@ -1287,6 +1305,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; @@ -1371,6 +1390,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; + sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + switch (iomap.type) { case IOMAP_MAPPED: error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); @@ -1379,10 +1400,24 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - RADIX_DAX_PMD, write); + RADIX_DAX_PMD, write && !sync); if (IS_ERR(entry)) goto finish_iomap; + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PMD into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) + goto finish_iomap; + *pfnp = pfn; + result = VM_FAULT_NEEDDSYNC; + goto finish_iomap; + } + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, write); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f64dc6ce5161..73e3b7085dbe 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -22,6 +22,11 @@ struct vm_fault; * Flags for all iomap mappings: */ #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ +/* + * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access + * written data and requires fdatasync to commit them to persistent storage. + */ +#define IOMAP_F_DIRTY 0x02 /* * Flags that only need to be reported for IOMAP_REPORT requests: diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411cb7442de..f57e55782d7d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1182,6 +1182,9 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ +#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables + * and needs fsync() to complete (for + * synchronous page faults in DAX) */ #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ @@ -1199,7 +1202,8 @@ static inline void clear_page_pfmemalloc(struct page *page) { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ - { VM_FAULT_DONE_COW, "DONE_COW" } + { VM_FAULT_DONE_COW, "DONE_COW" }, \ + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) -- cgit From 71eab6dfd91eabf06fe782a590c07e8a0795f5ea Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:43 +0100 Subject: dax: Implement dax_finish_sync_fault() Implement a function that filesystems can call to finish handling of synchronous page faults. It takes care of syncing appropriare file range and insertion of page table entry. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 83 +++++++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 2 ++ include/trace/events/fs_dax.h | 2 ++ 3 files changed, 87 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index bb9ff907738c..78233c716757 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1492,3 +1492,86 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +/** + * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function inserts writeable PTE or PMD entry into page tables for mmaped + * DAX file. It takes care of marking corresponding radix tree entry as dirty + * as well. + */ +static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, + enum page_entry_size pe_size, + pfn_t pfn) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + void *entry, **slot; + pgoff_t index = vmf->pgoff; + int vmf_ret, error; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + /* Did we race with someone splitting entry or so? */ + if (!entry || + (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || + (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, + VM_FAULT_NOPAGE); + return VM_FAULT_NOPAGE; + } + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + entry = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + switch (pe_size) { + case PE_SIZE_PTE: + error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + vmf_ret = dax_fault_return(error); + break; +#ifdef CONFIG_FS_DAX_PMD + case PE_SIZE_PMD: + vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + pfn, true); + break; +#endif + default: + vmf_ret = VM_FAULT_FALLBACK; + } + put_locked_mapping_entry(mapping, index); + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); + return vmf_ret; +} + +/** + * dax_finish_sync_fault - finish synchronous page fault + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function ensures that the file range touched by the page fault is + * stored persistently on the media and handles inserting of appropriate page + * table entry. + */ +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn) +{ + int err; + loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; + size_t len = 0; + + if (pe_size == PE_SIZE_PTE) + len = PAGE_SIZE; + else if (pe_size == PE_SIZE_PMD) + len = PMD_SIZE; + else + WARN_ON_ONCE(1); + err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); + if (err) + return VM_FAULT_SIGBUS; + return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); +} +EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/include/linux/dax.h b/include/linux/dax.h index e7fa4b8f45bc..d403f78b706c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -96,6 +96,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, const struct iomap_ops *ops); +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 88a9d19b8ff8..7725459fafef 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -190,6 +190,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite); TRACE_EVENT(dax_insert_mapping, TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), -- cgit From 497f6926d880c57f65bf7c3f1086526fa774c55e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:44 +0100 Subject: ext4: Simplify error handling in ext4_dax_huge_fault() If transaction starting fails, just bail out of the function immediately instead of checking for that condition throughout the function. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/ext4/file.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3cec0b95672f..208adfc3e673 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -302,16 +302,17 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); - else - result = VM_FAULT_SIGBUS; + result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); + ext4_journal_stop(handle); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { -- cgit From b8a6176c214cf9aa2679131ed7e4515cddaadc33 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:45 +0100 Subject: ext4: Support for synchronous DAX faults We return IOMAP_F_DIRTY flag from ext4_iomap_begin() when asked to prepare blocks for writing and the inode has some uncommitted metadata changes. In the fault handler ext4_dax_fault() we then detect this case (through VM_FAULT_NEEDDSYNC return value) and call helper dax_finish_sync_fault() to flush metadata changes and insert page table entry. Note that this will also dirty corresponding radix tree entry which is what we want - fsync(2) will still provide data integrity guarantees for applications not using userspace flushing. And applications using userspace flushing can avoid calling fsync(2) and thus avoid the performance overhead. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/ext4/file.c | 15 ++++++++++++++- fs/ext4/inode.c | 15 +++++++++++++++ fs/jbd2/journal.c | 17 +++++++++++++++++ include/linux/jbd2.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 208adfc3e673..08a1d1a33a90 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -295,6 +296,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + pfn_t pfn; if (write) { sb_start_pagefault(sb); @@ -310,9 +312,12 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { @@ -350,6 +355,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -719,6 +731,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31db875bc7a1..13a198924a0f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3394,6 +3394,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) + return !jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid); + /* Any metadata buffers to write? */ + if (!list_empty(&inode->i_mapping->private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3466,6 +3479,8 @@ retry: } iomap->flags = 0; + if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7d5ef3bf3f3e..fa8cde498b4b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -738,6 +738,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + int ret = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) + ret = 0; + if (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid) + ret = 0; + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_transaction_committed); + /* * When this function returns the transaction corresponding to tid * will be completed. If the transaction has currently running, start diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 606b6bce3a5b..296d1e0ea87b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); -- cgit From 7b565c9f965bac269cc6a1f25c819f77044abf9d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:46 +0100 Subject: xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault() xfs_filemap_pfn_mkwrite() duplicates a lot of __xfs_filemap_fault(). It will also need to handle flushing for synchronous page faults. So just make that function use __xfs_filemap_fault(). Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/xfs/xfs_file.c | 29 ++++------------------------- fs/xfs/xfs_trace.h | 2 -- 2 files changed, 4 insertions(+), 27 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7c6b8def6eed..4496b45678de 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1085,37 +1085,16 @@ xfs_filemap_page_mkwrite( } /* - * pfn_mkwrite was originally inteneded to ensure we capture time stamp - * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED - * to ensure we serialise the fault barrier in place. + * pfn_mkwrite was originally intended to ensure we capture time stamp updates + * on write faults. In reality, it needs to serialise against truncate and + * prepare memory for writing so handle is as standard write fault. */ static int xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret = VM_FAULT_NOPAGE; - loff_t size; - - trace_xfs_filemap_pfn_mkwrite(ip); - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* check if the faulting page hasn't raced with truncate */ - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); - return ret; - + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bb5514688d47..6333ad09e0f3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -688,8 +688,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); - TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), -- cgit From a39e596baa07cb1dc19c2ead14c9fd2a30f22352 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Nov 2017 16:36:47 +0100 Subject: xfs: support for synchronous DAX faults Return IOMAP_F_DIRTY from xfs_file_iomap_begin() when asked to prepare blocks for writing and the inode is pinned, and has dirty fields other than the timestamps. In __xfs_filemap_fault() we then detect this case and call dax_finish_sync_fault() to make sure all metadata is committed, and to insert the page table entry. Note that this will also dirty corresponding radix tree entry which is what we want - fsync(2) will still provide data integrity guarantees for applications not using userspace flushing. And applications using userspace flushing can avoid calling fsync(2) and thus avoid the performance overhead. [JK: Added VM_SYNC flag handling] Reviewed-by: Ross Zwisler Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/xfs/xfs_file.c | 15 ++++++++++++++- fs/xfs/xfs_iomap.c | 5 +++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4496b45678de..4827e82d5d2c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -44,6 +44,7 @@ #include #include #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -1040,7 +1041,11 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops); + pfn_t pfn; + + ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1110,6 +1115,13 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) @@ -1128,6 +1140,7 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f179bdf1644d..b43be199fbdf 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -33,6 +33,7 @@ #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -1086,6 +1087,10 @@ xfs_file_iomap_begin( trace_xfs_iomap_found(ip, offset, length, 0, &imap); } + if ((flags & IOMAP_WRITE) && xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + xfs_bmbt_to_iomap(ip, iomap, &imap); if (shared) -- cgit From aaa422c4c3f6ee958ea9d6c9260ac40f90a3f4e9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 13 Nov 2017 16:38:44 -0800 Subject: fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core While reviewing whether MAP_SYNC should strengthen its current guarantee of syncing writes from the initiating process to also include third-party readers observing dirty metadata, Dave pointed out that the check of IOMAP_WRITE is misplaced. The policy of what to with IOMAP_F_DIRTY should be separated from the generic filesystem mechanism of reporting dirty metadata. Move this policy to the fs-dax core to simplify the per-filesystem iomap handlers, and further centralize code that implements the MAP_SYNC policy. This otherwise should not change behavior, it just makes it easier to change behavior in the future. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Ross Zwisler Reported-by: Dave Chinner Signed-off-by: Dan Williams --- fs/dax.c | 15 +++++++++++++-- fs/ext4/inode.c | 2 +- fs/xfs/xfs_iomap.c | 4 ++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 78233c716757..27ba300660ff 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1079,6 +1079,17 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } +/* + * MAP_SYNC on a dax mapping guarantees dirty metadata is + * flushed on write-faults (non-cow), but not read-faults. + */ +static bool dax_fault_is_synchronous(unsigned long flags, + struct vm_area_struct *vma, struct iomap *iomap) +{ + return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) + && (iomap->flags & IOMAP_F_DIRTY); +} + static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { @@ -1170,7 +1181,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto finish_iomap; } - sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + sync = dax_fault_is_synchronous(flags, vma, &iomap); switch (iomap.type) { case IOMAP_MAPPED: @@ -1390,7 +1401,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); switch (iomap.type) { case IOMAP_MAPPED: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 13a198924a0f..ee4d907a4251 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3479,7 +3479,7 @@ retry: } iomap->flags = 0; - if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode)) + if (ext4_inode_datasync_dirty(inode)) iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b43be199fbdf..3c0e4cf72d2b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1087,8 +1087,8 @@ xfs_file_iomap_begin( trace_xfs_iomap_found(ip, offset, length, 0, &imap); } - if ((flags & IOMAP_WRITE) && xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields + & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; xfs_bmbt_to_iomap(ip, iomap, &imap); -- cgit From 66a86cc10945648cf506ef314b98deeb7af06419 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 14 Oct 2017 11:32:33 -0700 Subject: dax: quiet bdev_dax_supported() Before we add another failure reason, quiet the existing log messages. Leave it to the caller to decide if bdev_dax_supported() failures are errors worth emitting to the log. Reported-by: Jeff Moyer Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 557b93703532..b0cc8117eebe 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) long len; if (blocksize != PAGE_SIZE) { - pr_err("VFS (%s): error: unsupported blocksize for dax\n", + pr_debug("VFS (%s): error: unsupported blocksize for dax\n", sb->s_id); return -EINVAL; } err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); if (err) { - pr_err("VFS (%s): error: unaligned partition for dax\n", + pr_debug("VFS (%s): error: unaligned partition for dax\n", sb->s_id); return err; } dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); if (!dax_dev) { - pr_err("VFS (%s): error: device does not support dax\n", + pr_debug("VFS (%s): error: device does not support dax\n", sb->s_id); return -EOPNOTSUPP; } @@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) put_dax(dax_dev); if (len < 1) { - pr_err("VFS (%s): error: dax access failed (%ld)", + pr_debug("VFS (%s): error: dax access failed (%ld)\n", sb->s_id, len); return len < 0 ? len : -EIO; } -- cgit From 7a862fbbdec665190c5ef298c0c6ec9f3915cf45 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 Oct 2017 10:24:45 -0700 Subject: brd: remove dax support DAX support in brd is awkward because its backing page frames are distinct from the ones provided by pmem, dcssblk, or axonram. We need pfn_t_devmap() entries to fully support DAX, and the limited DAX support for pfn_t_special() page frames is not interesting for brd when pmem is already a superset of brd. Lastly, brd is the only dax capable driver that may sleep in its ->direct_access() implementation. So it causes a global burden with no net gain of kernel functionality. For all these reasons, remove DAX support. Cc: Jens Axboe Cc: Matthew Wilcox Cc: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/block/Kconfig | 12 ---------- drivers/block/brd.c | 65 --------------------------------------------------- 2 files changed, 77 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 2dfe99b328f8..da8bf0268ade 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -297,7 +297,6 @@ config BLK_DEV_SX8 config BLK_DEV_RAM tristate "RAM block device support" - select DAX if BLK_DEV_RAM_DAX ---help--- Saying Y here will allow you to use a portion of your RAM memory as a block device, so that you can make file systems on it, read and @@ -333,17 +332,6 @@ config BLK_DEV_RAM_SIZE The default value is 4096 kilobytes. Only change this if you know what you are doing. -config BLK_DEV_RAM_DAX - bool "Support Direct Access (DAX) to RAM block devices" - depends on BLK_DEV_RAM && FS_DAX - default n - help - Support filesystems using DAX to access RAM block devices. This - avoids double-buffering data in the page cache before copying it - to the block device. Answering Y will slightly enlarge the kernel, - and will prevent RAM block device backing store memory from being - allocated from highmem (only a problem for highmem systems). - config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media (DEPRECATED)" depends on !UML diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 2d7178f7754e..b2391bbd7e5a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -20,11 +20,6 @@ #include #include #include -#ifdef CONFIG_BLK_DEV_RAM_DAX -#include -#include -#include -#endif #include @@ -44,9 +39,6 @@ struct brd_device { struct request_queue *brd_queue; struct gendisk *brd_disk; -#ifdef CONFIG_BLK_DEV_RAM_DAX - struct dax_device *dax_dev; -#endif struct list_head brd_list; /* @@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) * restriction might be able to be lifted. */ gfp_flags = GFP_NOIO | __GFP_ZERO; -#ifndef CONFIG_BLK_DEV_RAM_DAX - gfp_flags |= __GFP_HIGHMEM; -#endif page = alloc_page(gfp_flags); if (!page) return NULL; @@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, return err; } -#ifdef CONFIG_BLK_DEV_RAM_DAX -static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) -{ - struct page *page; - - if (!brd) - return -ENODEV; - page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT); - if (!page) - return -ENOSPC; - *kaddr = page_address(page); - *pfn = page_to_pfn_t(page); - - return 1; -} - -static long brd_dax_direct_access(struct dax_device *dax_dev, - pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) -{ - struct brd_device *brd = dax_get_private(dax_dev); - - return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn); -} - -static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, - void *addr, size_t bytes, struct iov_iter *i) -{ - return copy_from_iter(addr, bytes, i); -} - -static const struct dax_operations brd_dax_ops = { - .direct_access = brd_dax_direct_access, - .copy_from_iter = brd_dax_copy_from_iter, -}; -#endif - static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .rw_page = brd_rw_page, @@ -450,21 +402,8 @@ static struct brd_device *brd_alloc(int i) sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); -#ifdef CONFIG_BLK_DEV_RAM_DAX - queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); - brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops); - if (!brd->dax_dev) - goto out_free_inode; -#endif - - return brd; -#ifdef CONFIG_BLK_DEV_RAM_DAX -out_free_inode: - kill_dax(brd->dax_dev); - put_dax(brd->dax_dev); -#endif out_free_queue: blk_cleanup_queue(brd->brd_queue); out_free_dev: @@ -504,10 +443,6 @@ out: static void brd_del_one(struct brd_device *brd) { list_del(&brd->brd_list); -#ifdef CONFIG_BLK_DEV_RAM_DAX - kill_dax(brd->dax_dev); - put_dax(brd->dax_dev); -#endif del_gendisk(brd->brd_disk); brd_free(brd); } -- cgit From 6a21586a637e624ae736f94aeb0839f6a1dd0411 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 14 Oct 2017 17:42:02 -0700 Subject: dax: stop requiring a live device for dax_flush() Now that dax_flush() is no longer a driver callback (commit c3ca015fab6d "dax: remove the pmem_dax_ops->flush abstraction"), stop requiring the dax_read_lock() to be held and the device to be alive. This is in preparation for switching filesystem-dax to store pfns instead of sectors in the radix. Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index b0cc8117eebe..69329e3954ea 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter); void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { - if (unlikely(!dax_alive(dax_dev))) - return; - if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))) return; -- cgit From 957ac8c421ad8b5eef9b17fe98e146d8311a541e Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 14 Nov 2017 20:37:27 -0500 Subject: dax: fix PMD faults on zero-length files PMD faults on a zero length file on a file system mounted with -o dax will not generate SIGBUS as expected. fd = open(...O_TRUNC); addr = mmap(NULL, 2*1024*1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); *addr = 'a'; The problem is this code in dax_iomap_pmd_fault: max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; If the inode size is zero, we end up with a max_pgoff that is way larger than 0. :) Fix it by using DIV_ROUND_UP, as is done elsewhere in the kernel. I tested this with some simple test code that ensured that SIGBUS was received where expected. Cc: Fixes: 642261ac995e ("dax: add struct iomap based DAX PMD support") Signed-off-by: Jeff Moyer Signed-off-by: Dan Williams --- fs/dax.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 27ba300660ff..f757cd0e2d07 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1333,7 +1333,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * this is a reliable test. */ pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1357,13 +1357,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff > max_pgoff) { + if (pgoff >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* -- cgit From 9f586fff6574f6ecbf323f92d44ffaf0d96225fe Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 14 Nov 2017 09:59:54 -0500 Subject: dax: fix general protection fault in dax_alloc_inode Don't crash in case of allocation failure in dax_alloc_inode. syzkaller hit the following crash on e4880bc5dfb1 kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access [..] RIP: 0010:dax_alloc_inode+0x3b/0x70 drivers/dax/super.c:348 Call Trace: alloc_inode+0x65/0x180 fs/inode.c:208 new_inode_pseudo+0x69/0x190 fs/inode.c:890 new_inode+0x1c/0x40 fs/inode.c:919 mount_pseudo_xattr+0x288/0x560 fs/libfs.c:261 mount_pseudo include/linux/fs.h:2137 [inline] dax_mount+0x2e/0x40 drivers/dax/super.c:388 mount_fs+0x66/0x2d0 fs/super.c:1223 Cc: Fixes: 7b6be8444e0f ("dax: refactor dax-fs into a generic provider...") Reported-by: syzbot Signed-off-by: Mikulas Patocka Signed-off-by: Dan Williams --- drivers/dax/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 69329e3954ea..3ec804672601 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -341,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb) struct inode *inode; dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + if (!dax_dev) + return NULL; + inode = &dax_dev->inode; inode->i_rdev = 0; return inode; -- cgit