summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/boot/dts/imx6qp-prtwd3.dts2
-rw-r--r--arch/arm/boot/dts/imx6ull-pinfunc.h2
-rw-r--r--arch/arm/boot/dts/ls1021a-tsn.dts2
-rw-r--r--arch/arm/boot/dts/socfpga_arria10_socdk_qspi.dts2
-rw-r--r--arch/arm/boot/dts/socfpga_arria5_socdk.dts2
-rw-r--r--arch/arm/boot/dts/socfpga_cyclone5_socdk.dts2
-rw-r--r--arch/arm/boot/dts/socfpga_cyclone5_sockit.dts2
-rw-r--r--arch/arm/boot/dts/socfpga_cyclone5_socrates.dts2
-rw-r--r--arch/arm/boot/dts/socfpga_cyclone5_sodia.dts2
-rw-r--r--arch/arm/boot/dts/socfpga_cyclone5_vining_fpga.dts4
-rw-r--r--arch/arm/mach-rockchip/platsmp.c2
-rw-r--r--arch/arm64/Kconfig.platforms1
-rw-r--r--arch/arm64/boot/dts/amlogic/meson-axg-jethome-jethub-j100.dts30
-rw-r--r--arch/arm64/boot/dts/apple/t8103-j274.dts2
-rw-r--r--arch/arm64/boot/dts/apple/t8103.dtsi11
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1088a-ten64.dts2
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-lx2160a-bluebox3.dts4
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mq.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3308-roc-cc.dts2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-khadas-edge.dtsi1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-kobol-helios64.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-leez-p710.dts2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi2
-rw-r--r--arch/arm64/include/asm/kvm_arm.h4
-rw-r--r--arch/arm64/kernel/entry-ftrace.S6
-rw-r--r--arch/arm64/kernel/machine_kexec.c2
-rw-r--r--arch/arm64/kernel/machine_kexec_file.c1
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h14
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h7
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c8
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c4
-rw-r--r--arch/csky/kernel/traps.c4
-rw-r--r--arch/mips/net/bpf_jit_comp.h2
-rw-r--r--arch/parisc/Makefile5
-rw-r--r--arch/parisc/configs/generic-64bit_defconfig14
-rw-r--r--arch/parisc/install.sh1
-rw-r--r--arch/parisc/kernel/time.c30
-rw-r--r--arch/powerpc/kernel/module_64.c42
-rw-r--r--arch/powerpc/platforms/85xx/smp.c4
-rw-r--r--arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts1
-rw-r--r--arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts113
-rw-r--r--arch/riscv/include/asm/kvm_host.h8
-rw-r--r--arch/riscv/kvm/mmu.c6
-rw-r--r--arch/s390/configs/debug_defconfig12
-rw-r--r--arch/s390/configs/defconfig9
-rw-r--r--arch/s390/configs/zfcpdump_defconfig2
-rw-r--r--arch/s390/include/asm/pci_io.h7
-rw-r--r--arch/s390/kernel/ftrace.c2
-rw-r--r--arch/s390/kernel/irq.c9
-rw-r--r--arch/s390/kernel/machine_kexec_file.c38
-rw-r--r--arch/s390/lib/test_unwind.c5
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/entry/entry_64.S35
-rw-r--r--arch/x86/include/asm/intel-family.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/sev-common.h11
-rw-r--r--arch/x86/kernel/fpu/signal.c2
-rw-r--r--arch/x86/kernel/sev.c57
-rw-r--r--arch/x86/kernel/smpboot.c14
-rw-r--r--arch/x86/kernel/tsc.c28
-rw-r--r--arch/x86/kernel/tsc_sync.c41
-rw-r--r--arch/x86/kvm/hyperv.c7
-rw-r--r--arch/x86/kvm/ioapic.h1
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu/mmu.c120
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c38
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h5
-rw-r--r--arch/x86/kvm/svm/avic.c17
-rw-r--r--arch/x86/kvm/svm/pmu.c2
-rw-r--r--arch/x86/kvm/svm/sev.c263
-rw-r--r--arch/x86/kvm/svm/svm.c1
-rw-r--r--arch/x86/kvm/svm/svm.h1
-rw-r--r--arch/x86/kvm/vmx/nested.c53
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c20
-rw-r--r--arch/x86/kvm/vmx/vmx.c89
-rw-r--r--arch/x86/kvm/x86.c87
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/net/bpf_jit_comp.c51
-rw-r--r--arch/x86/platform/efi/quirks.c3
-rw-r--r--arch/x86/realmode/init.c12
-rw-r--r--arch/x86/xen/xen-asm.S20
83 files changed, 917 insertions, 524 deletions
diff --git a/arch/arm/boot/dts/imx6qp-prtwd3.dts b/arch/arm/boot/dts/imx6qp-prtwd3.dts
index 7648e8a02000..cf6571cc4682 100644
--- a/arch/arm/boot/dts/imx6qp-prtwd3.dts
+++ b/arch/arm/boot/dts/imx6qp-prtwd3.dts
@@ -178,6 +178,8 @@
label = "cpu";
ethernet = <&fec>;
phy-mode = "rgmii-id";
+ rx-internal-delay-ps = <2000>;
+ tx-internal-delay-ps = <2000>;
fixed-link {
speed = <100>;
diff --git a/arch/arm/boot/dts/imx6ull-pinfunc.h b/arch/arm/boot/dts/imx6ull-pinfunc.h
index eb025a9d4759..7328d4ef8559 100644
--- a/arch/arm/boot/dts/imx6ull-pinfunc.h
+++ b/arch/arm/boot/dts/imx6ull-pinfunc.h
@@ -82,6 +82,6 @@
#define MX6ULL_PAD_CSI_DATA04__ESAI_TX_FS 0x01F4 0x0480 0x0000 0x9 0x0
#define MX6ULL_PAD_CSI_DATA05__ESAI_TX_CLK 0x01F8 0x0484 0x0000 0x9 0x0
#define MX6ULL_PAD_CSI_DATA06__ESAI_TX5_RX0 0x01FC 0x0488 0x0000 0x9 0x0
-#define MX6ULL_PAD_CSI_DATA07__ESAI_T0 0x0200 0x048C 0x0000 0x9 0x0
+#define MX6ULL_PAD_CSI_DATA07__ESAI_TX0 0x0200 0x048C 0x0000 0x9 0x0
#endif /* __DTS_IMX6ULL_PINFUNC_H */
diff --git a/arch/arm/boot/dts/ls1021a-tsn.dts b/arch/arm/boot/dts/ls1021a-tsn.dts
index ff0ffb22768b..1ea32fff4120 100644
--- a/arch/arm/boot/dts/ls1021a-tsn.dts
+++ b/arch/arm/boot/dts/ls1021a-tsn.dts
@@ -91,6 +91,8 @@
/* Internal port connected to eth2 */
ethernet = <&enet2>;
phy-mode = "rgmii";
+ rx-internal-delay-ps = <0>;
+ tx-internal-delay-ps = <0>;
reg = <4>;
fixed-link {
diff --git a/arch/arm/boot/dts/socfpga_arria10_socdk_qspi.dts b/arch/arm/boot/dts/socfpga_arria10_socdk_qspi.dts
index 2b645642b935..2a745522404d 100644
--- a/arch/arm/boot/dts/socfpga_arria10_socdk_qspi.dts
+++ b/arch/arm/boot/dts/socfpga_arria10_socdk_qspi.dts
@@ -12,7 +12,7 @@
flash0: n25q00@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q00aa";
+ compatible = "micron,mt25qu02g", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <100000000>;
diff --git a/arch/arm/boot/dts/socfpga_arria5_socdk.dts b/arch/arm/boot/dts/socfpga_arria5_socdk.dts
index 90e676e7019f..1b02d46496a8 100644
--- a/arch/arm/boot/dts/socfpga_arria5_socdk.dts
+++ b/arch/arm/boot/dts/socfpga_arria5_socdk.dts
@@ -119,7 +119,7 @@
flash: flash@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q256a";
+ compatible = "micron,n25q256a", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <100000000>;
diff --git a/arch/arm/boot/dts/socfpga_cyclone5_socdk.dts b/arch/arm/boot/dts/socfpga_cyclone5_socdk.dts
index 6f138b2b2616..51bb436784e2 100644
--- a/arch/arm/boot/dts/socfpga_cyclone5_socdk.dts
+++ b/arch/arm/boot/dts/socfpga_cyclone5_socdk.dts
@@ -124,7 +124,7 @@
flash0: n25q00@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q00";
+ compatible = "micron,mt25qu02g", "jedec,spi-nor";
reg = <0>; /* chip select */
spi-max-frequency = <100000000>;
diff --git a/arch/arm/boot/dts/socfpga_cyclone5_sockit.dts b/arch/arm/boot/dts/socfpga_cyclone5_sockit.dts
index c155ff02eb6e..cae9ddd5ed38 100644
--- a/arch/arm/boot/dts/socfpga_cyclone5_sockit.dts
+++ b/arch/arm/boot/dts/socfpga_cyclone5_sockit.dts
@@ -169,7 +169,7 @@
flash: flash@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q00";
+ compatible = "micron,mt25qu02g", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <100000000>;
diff --git a/arch/arm/boot/dts/socfpga_cyclone5_socrates.dts b/arch/arm/boot/dts/socfpga_cyclone5_socrates.dts
index 8d5d3996f6f2..ca18b959e655 100644
--- a/arch/arm/boot/dts/socfpga_cyclone5_socrates.dts
+++ b/arch/arm/boot/dts/socfpga_cyclone5_socrates.dts
@@ -80,7 +80,7 @@
flash: flash@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q256a";
+ compatible = "micron,n25q256a", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <100000000>;
m25p,fast-read;
diff --git a/arch/arm/boot/dts/socfpga_cyclone5_sodia.dts b/arch/arm/boot/dts/socfpga_cyclone5_sodia.dts
index 99a71757cdf4..3f7aa7bf0863 100644
--- a/arch/arm/boot/dts/socfpga_cyclone5_sodia.dts
+++ b/arch/arm/boot/dts/socfpga_cyclone5_sodia.dts
@@ -116,7 +116,7 @@
flash0: n25q512a@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q512a";
+ compatible = "micron,n25q512a", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <100000000>;
diff --git a/arch/arm/boot/dts/socfpga_cyclone5_vining_fpga.dts b/arch/arm/boot/dts/socfpga_cyclone5_vining_fpga.dts
index a060718758b6..25874e1b9c82 100644
--- a/arch/arm/boot/dts/socfpga_cyclone5_vining_fpga.dts
+++ b/arch/arm/boot/dts/socfpga_cyclone5_vining_fpga.dts
@@ -224,7 +224,7 @@
n25q128@0 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q128";
+ compatible = "micron,n25q128", "jedec,spi-nor";
reg = <0>; /* chip select */
spi-max-frequency = <100000000>;
m25p,fast-read;
@@ -241,7 +241,7 @@
n25q00@1 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "n25q00";
+ compatible = "micron,mt25qu02g", "jedec,spi-nor";
reg = <1>; /* chip select */
spi-max-frequency = <100000000>;
m25p,fast-read;
diff --git a/arch/arm/mach-rockchip/platsmp.c b/arch/arm/mach-rockchip/platsmp.c
index d60856898d97..5ec58d004b7d 100644
--- a/arch/arm/mach-rockchip/platsmp.c
+++ b/arch/arm/mach-rockchip/platsmp.c
@@ -189,7 +189,7 @@ static int __init rockchip_smp_prepare_sram(struct device_node *node)
rockchip_boot_fn = __pa_symbol(secondary_startup);
/* copy the trampoline to sram, that runs during startup of the core */
- memcpy(sram_base_addr, &rockchip_secondary_trampoline, trampoline_sz);
+ memcpy_toio(sram_base_addr, &rockchip_secondary_trampoline, trampoline_sz);
flush_cache_all();
outer_clean_range(0, trampoline_sz);
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 1aa8b7073218..54e3910e8b9b 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -161,7 +161,6 @@ config ARCH_MEDIATEK
config ARCH_MESON
bool "Amlogic Platforms"
- select COMMON_CLK
help
This enables support for the arm64 based Amlogic SoCs
such as the s905, S905X/D, S912, A113X/D or S905X/D2
diff --git a/arch/arm64/boot/dts/amlogic/meson-axg-jethome-jethub-j100.dts b/arch/arm64/boot/dts/amlogic/meson-axg-jethome-jethub-j100.dts
index 52ebe371df26..561eec21b4de 100644
--- a/arch/arm64/boot/dts/amlogic/meson-axg-jethome-jethub-j100.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-axg-jethome-jethub-j100.dts
@@ -134,23 +134,23 @@
type = "critical";
};
};
- };
- cpu_cooling_maps: cooling-maps {
- map0 {
- trip = <&cpu_passive>;
- cooling-device = <&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
- <&cpu1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
- <&cpu2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
- <&cpu3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
- };
+ cpu_cooling_maps: cooling-maps {
+ map0 {
+ trip = <&cpu_passive>;
+ cooling-device = <&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+ <&cpu1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+ <&cpu2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+ <&cpu3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+ };
- map1 {
- trip = <&cpu_hot>;
- cooling-device = <&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
- <&cpu1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
- <&cpu2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
- <&cpu3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+ map1 {
+ trip = <&cpu_hot>;
+ cooling-device = <&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+ <&cpu1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+ <&cpu2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+ <&cpu3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+ };
};
};
};
diff --git a/arch/arm64/boot/dts/apple/t8103-j274.dts b/arch/arm64/boot/dts/apple/t8103-j274.dts
index 33a80f9501dc..02c36301e985 100644
--- a/arch/arm64/boot/dts/apple/t8103-j274.dts
+++ b/arch/arm64/boot/dts/apple/t8103-j274.dts
@@ -60,7 +60,7 @@
&port02 {
bus-range = <3 3>;
- ethernet0: pci@0,0 {
+ ethernet0: ethernet@0,0 {
reg = <0x30000 0x0 0x0 0x0 0x0>;
/* To be filled by the loader */
local-mac-address = [00 10 18 00 00 00];
diff --git a/arch/arm64/boot/dts/apple/t8103.dtsi b/arch/arm64/boot/dts/apple/t8103.dtsi
index fc8b2bb06ffe..8b61e7fd3e9c 100644
--- a/arch/arm64/boot/dts/apple/t8103.dtsi
+++ b/arch/arm64/boot/dts/apple/t8103.dtsi
@@ -7,6 +7,7 @@
* Copyright The Asahi Linux Contributors
*/
+#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/interrupt-controller/apple-aic.h>
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/pinctrl/apple.h>
@@ -143,6 +144,7 @@
apple,npins = <212>;
interrupt-controller;
+ #interrupt-cells = <2>;
interrupt-parent = <&aic>;
interrupts = <AIC_IRQ 190 IRQ_TYPE_LEVEL_HIGH>,
<AIC_IRQ 191 IRQ_TYPE_LEVEL_HIGH>,
@@ -169,6 +171,7 @@
apple,npins = <42>;
interrupt-controller;
+ #interrupt-cells = <2>;
interrupt-parent = <&aic>;
interrupts = <AIC_IRQ 268 IRQ_TYPE_LEVEL_HIGH>,
<AIC_IRQ 269 IRQ_TYPE_LEVEL_HIGH>,
@@ -189,6 +192,7 @@
apple,npins = <23>;
interrupt-controller;
+ #interrupt-cells = <2>;
interrupt-parent = <&aic>;
interrupts = <AIC_IRQ 330 IRQ_TYPE_LEVEL_HIGH>,
<AIC_IRQ 331 IRQ_TYPE_LEVEL_HIGH>,
@@ -209,6 +213,7 @@
apple,npins = <16>;
interrupt-controller;
+ #interrupt-cells = <2>;
interrupt-parent = <&aic>;
interrupts = <AIC_IRQ 391 IRQ_TYPE_LEVEL_HIGH>,
<AIC_IRQ 392 IRQ_TYPE_LEVEL_HIGH>,
@@ -281,7 +286,7 @@
port00: pci@0,0 {
device_type = "pci";
reg = <0x0 0x0 0x0 0x0 0x0>;
- reset-gpios = <&pinctrl_ap 152 0>;
+ reset-gpios = <&pinctrl_ap 152 GPIO_ACTIVE_LOW>;
max-link-speed = <2>;
#address-cells = <3>;
@@ -301,7 +306,7 @@
port01: pci@1,0 {
device_type = "pci";
reg = <0x800 0x0 0x0 0x0 0x0>;
- reset-gpios = <&pinctrl_ap 153 0>;
+ reset-gpios = <&pinctrl_ap 153 GPIO_ACTIVE_LOW>;
max-link-speed = <2>;
#address-cells = <3>;
@@ -321,7 +326,7 @@
port02: pci@2,0 {
device_type = "pci";
reg = <0x1000 0x0 0x0 0x0 0x0>;
- reset-gpios = <&pinctrl_ap 33 0>;
+ reset-gpios = <&pinctrl_ap 33 GPIO_ACTIVE_LOW>;
max-link-speed = <1>;
#address-cells = <3>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a-ten64.dts b/arch/arm64/boot/dts/freescale/fsl-ls1088a-ten64.dts
index 3063851c2fb9..d3f03dcbb8c3 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1088a-ten64.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a-ten64.dts
@@ -38,7 +38,6 @@
powerdn {
label = "External Power Down";
gpios = <&gpio1 17 GPIO_ACTIVE_LOW>;
- interrupts = <&gpio1 17 IRQ_TYPE_EDGE_FALLING>;
linux,code = <KEY_POWER>;
};
@@ -46,7 +45,6 @@
admin {
label = "ADMIN button";
gpios = <&gpio3 8 GPIO_ACTIVE_HIGH>;
- interrupts = <&gpio3 8 IRQ_TYPE_EDGE_RISING>;
linux,code = <KEY_WPS_BUTTON>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a-bluebox3.dts b/arch/arm64/boot/dts/freescale/fsl-lx2160a-bluebox3.dts
index b21be03da0af..042c486bdda2 100644
--- a/arch/arm64/boot/dts/freescale/fsl-lx2160a-bluebox3.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a-bluebox3.dts
@@ -386,6 +386,8 @@
reg = <2>;
ethernet = <&dpmac17>;
phy-mode = "rgmii-id";
+ rx-internal-delay-ps = <2000>;
+ tx-internal-delay-ps = <2000>;
fixed-link {
speed = <1000>;
@@ -529,6 +531,8 @@
reg = <2>;
ethernet = <&dpmac18>;
phy-mode = "rgmii-id";
+ rx-internal-delay-ps = <2000>;
+ tx-internal-delay-ps = <2000>;
fixed-link {
speed = <1000>;
diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
index 972766b67a15..71bf497f99c2 100644
--- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
@@ -524,8 +524,6 @@
<&clk IMX8MQ_VIDEO_PLL1>,
<&clk IMX8MQ_VIDEO_PLL1_OUT>;
assigned-clock-rates = <0>, <0>, <0>, <594000000>;
- interconnects = <&noc IMX8MQ_ICM_LCDIF &noc IMX8MQ_ICS_DRAM>;
- interconnect-names = "dram";
status = "disabled";
port@0 {
diff --git a/arch/arm64/boot/dts/rockchip/rk3308-roc-cc.dts b/arch/arm64/boot/dts/rockchip/rk3308-roc-cc.dts
index 665b2e69455d..ea6820902ede 100644
--- a/arch/arm64/boot/dts/rockchip/rk3308-roc-cc.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3308-roc-cc.dts
@@ -97,7 +97,7 @@
regulator-max-microvolt = <3300000>;
regulator-always-on;
regulator-boot-on;
- vim-supply = <&vcc_io>;
+ vin-supply = <&vcc_io>;
};
vdd_core: vdd-core {
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-khadas-edge.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-khadas-edge.dtsi
index d5c7648c841d..f1fcc6b5b402 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-khadas-edge.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-khadas-edge.dtsi
@@ -705,7 +705,6 @@
&sdhci {
bus-width = <8>;
mmc-hs400-1_8v;
- mmc-hs400-enhanced-strobe;
non-removable;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-kobol-helios64.dts b/arch/arm64/boot/dts/rockchip/rk3399-kobol-helios64.dts
index 63c7681843da..b6ac00f64613 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-kobol-helios64.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-kobol-helios64.dts
@@ -276,6 +276,7 @@
clock-output-names = "xin32k", "rk808-clkout2";
pinctrl-names = "default";
pinctrl-0 = <&pmic_int_l>;
+ rockchip,system-power-controller;
vcc1-supply = <&vcc5v0_sys>;
vcc2-supply = <&vcc5v0_sys>;
vcc3-supply = <&vcc5v0_sys>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-leez-p710.dts b/arch/arm64/boot/dts/rockchip/rk3399-leez-p710.dts
index 7c93f840bc64..e890166e7fd4 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-leez-p710.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-leez-p710.dts
@@ -55,7 +55,7 @@
regulator-boot-on;
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
- vim-supply = <&vcc3v3_sys>;
+ vin-supply = <&vcc3v3_sys>;
};
vcc3v3_sys: vcc3v3-sys {
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
index 98136c88fa49..6a434be62819 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
@@ -502,7 +502,7 @@
status = "okay";
bt656-supply = <&vcc_3v0>;
- audio-supply = <&vcc_3v0>;
+ audio-supply = <&vcc1v8_codec>;
sdmmc-supply = <&vcc_sdio>;
gpio1830-supply = <&vcc_3v0>;
};
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index a39fcf318c77..01d47c5886dc 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -91,7 +91,7 @@
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
/* TCR_EL2 Registers bits */
-#define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
+#define TCR_EL2_RES1 ((1U << 31) | (1 << 23))
#define TCR_EL2_TBI (1 << 20)
#define TCR_EL2_PS_SHIFT 16
#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
@@ -276,7 +276,7 @@
#define CPTR_EL2_TFP_SHIFT 10
/* Hyp Coprocessor Trap Register */
-#define CPTR_EL2_TCPAC (1 << 31)
+#define CPTR_EL2_TCPAC (1U << 31)
#define CPTR_EL2_TAM (1 << 30)
#define CPTR_EL2_TTA (1 << 20)
#define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT)
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index b3e4f9a088b1..8cf970d219f5 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -77,11 +77,17 @@
.endm
SYM_CODE_START(ftrace_regs_caller)
+#ifdef BTI_C
+ BTI_C
+#endif
ftrace_regs_entry 1
b ftrace_common
SYM_CODE_END(ftrace_regs_caller)
SYM_CODE_START(ftrace_caller)
+#ifdef BTI_C
+ BTI_C
+#endif
ftrace_regs_entry 0
b ftrace_common
SYM_CODE_END(ftrace_caller)
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 1038494135c8..6fb31c117ebe 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -147,7 +147,7 @@ int machine_kexec_post_load(struct kimage *kimage)
if (rc)
return rc;
kimage->arch.ttbr1 = __pa(trans_pgd);
- kimage->arch.zero_page = __pa(empty_zero_page);
+ kimage->arch.zero_page = __pa_symbol(empty_zero_page);
reloc_size = __relocate_new_kernel_end - __relocate_new_kernel_start;
memcpy(reloc_code, __relocate_new_kernel_start, reloc_size);
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 63634b4d72c1..59c648d51848 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -149,6 +149,7 @@ int load_other_segments(struct kimage *image,
initrd_len, cmdline, 0);
if (!dtb) {
pr_err("Preparing for new dtb failed\n");
+ ret = -EINVAL;
goto out_err;
}
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 7a0af1d39303..96c5f3fb7838 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -403,6 +403,8 @@ typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
+
/*
* Allow the hypervisor to handle the exit with an exit handler if it has one.
*
@@ -429,6 +431,18 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
*/
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
+ /*
+ * Save PSTATE early so that we can evaluate the vcpu mode
+ * early on.
+ */
+ vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
+
+ /*
+ * Check whether we want to repaint the state one way or
+ * another.
+ */
+ early_exit_filter(vcpu, exit_code);
+
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index de7e14c862e6..7ecca8b07851 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -70,7 +70,12 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
{
ctxt->regs.pc = read_sysreg_el2(SYS_ELR);
- ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
+ /*
+ * Guest PSTATE gets saved at guest fixup time in all
+ * cases. We still need to handle the nVHE host side here.
+ */
+ if (!has_vhe() && ctxt->__hyp_running_vcpu)
+ ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index c0e3fed26d93..d13115a12434 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -233,7 +233,7 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
* Returns false if the guest ran in AArch32 when it shouldn't have, and
* thus should exit to the host, or true if a the guest run loop can continue.
*/
-static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
{
struct kvm *kvm = kern_hyp_va(vcpu->kvm);
@@ -248,10 +248,7 @@ static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
vcpu->arch.target = -1;
*exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
*exit_code |= ARM_EXCEPTION_IL;
- return false;
}
-
- return true;
}
/* Switch to the guest for legacy non-VHE systems */
@@ -316,9 +313,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
/* Jump in the fire! */
exit_code = __guest_enter(vcpu);
- if (unlikely(!handle_aarch32_guest(vcpu, &exit_code)))
- break;
-
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 5a2cb5d9bc4b..fbb26b93c347 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -112,6 +112,10 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
return hyp_exit_handlers;
}
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+}
+
/* Switch to the guest for VHE systems running in EL2 */
static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
{
diff --git a/arch/csky/kernel/traps.c b/arch/csky/kernel/traps.c
index e5fbf8653a21..2020af88b636 100644
--- a/arch/csky/kernel/traps.c
+++ b/arch/csky/kernel/traps.c
@@ -209,7 +209,7 @@ asmlinkage void do_trap_illinsn(struct pt_regs *regs)
asmlinkage void do_trap_fpe(struct pt_regs *regs)
{
-#ifdef CONFIG_CPU_HAS_FP
+#ifdef CONFIG_CPU_HAS_FPU
return fpu_fpe(regs);
#else
do_trap_error(regs, SIGILL, ILL_ILLOPC, regs->pc,
@@ -219,7 +219,7 @@ asmlinkage void do_trap_fpe(struct pt_regs *regs)
asmlinkage void do_trap_priv(struct pt_regs *regs)
{
-#ifdef CONFIG_CPU_HAS_FP
+#ifdef CONFIG_CPU_HAS_FPU
if (user_mode(regs) && fpu_libc_helper(regs))
return;
#endif
diff --git a/arch/mips/net/bpf_jit_comp.h b/arch/mips/net/bpf_jit_comp.h
index 6f3a7b07294b..a37fe20818eb 100644
--- a/arch/mips/net/bpf_jit_comp.h
+++ b/arch/mips/net/bpf_jit_comp.h
@@ -98,7 +98,7 @@ do { \
#define emit(...) __emit(__VA_ARGS__)
/* Workaround for R10000 ll/sc errata */
-#ifdef CONFIG_WAR_R10000
+#ifdef CONFIG_WAR_R10000_LLSC
#define LLSC_beqz beqzl
#else
#define LLSC_beqz beqz
diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile
index 8db4af4879d0..82d77f4b0d08 100644
--- a/arch/parisc/Makefile
+++ b/arch/parisc/Makefile
@@ -15,7 +15,12 @@
# Mike Shaver, Helge Deller and Martin K. Petersen
#
+ifdef CONFIG_PARISC_SELF_EXTRACT
+boot := arch/parisc/boot
+KBUILD_IMAGE := $(boot)/bzImage
+else
KBUILD_IMAGE := vmlinuz
+endif
NM = sh $(srctree)/arch/parisc/nm
CHECKFLAGS += -D__hppa__=1
diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig
index d2daeac2b217..1b8fd80cbe7f 100644
--- a/arch/parisc/configs/generic-64bit_defconfig
+++ b/arch/parisc/configs/generic-64bit_defconfig
@@ -1,7 +1,9 @@
CONFIG_LOCALVERSION="-64bit"
# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_KERNEL_LZ4=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_TASKSTATS=y
@@ -35,6 +37,7 @@ CONFIG_MODVERSIONS=y
CONFIG_BLK_DEV_INTEGRITY=y
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
+CONFIG_MEMORY_FAILURE=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -65,12 +68,15 @@ CONFIG_SCSI_ISCSI_ATTRS=y
CONFIG_SCSI_SRP_ATTRS=y
CONFIG_ISCSI_BOOT_SYSFS=y
CONFIG_SCSI_MPT2SAS=y
-CONFIG_SCSI_LASI700=m
+CONFIG_SCSI_LASI700=y
CONFIG_SCSI_SYM53C8XX_2=y
CONFIG_SCSI_ZALON=y
CONFIG_SCSI_QLA_ISCSI=m
CONFIG_SCSI_DH=y
CONFIG_ATA=y
+CONFIG_SATA_SIL=y
+CONFIG_SATA_SIS=y
+CONFIG_SATA_VIA=y
CONFIG_PATA_NS87415=y
CONFIG_PATA_SIL680=y
CONFIG_ATA_GENERIC=y
@@ -79,6 +85,7 @@ CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_RAID=m
CONFIG_DM_UEVENT=y
+CONFIG_DM_AUDIT=y
CONFIG_FUSION=y
CONFIG_FUSION_SPI=y
CONFIG_FUSION_SAS=y
@@ -196,10 +203,15 @@ CONFIG_FB_MATROX_G=y
CONFIG_FB_MATROX_I2C=y
CONFIG_FB_MATROX_MAVEN=y
CONFIG_FB_RADEON=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_CLUT224 is not set
CONFIG_HIDRAW=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y
CONFIG_USB=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_OHCI_HCD_PLATFORM=y
CONFIG_UIO=y
CONFIG_UIO_PDRV_GENIRQ=m
CONFIG_UIO_AEC=m
diff --git a/arch/parisc/install.sh b/arch/parisc/install.sh
index 056d588befdd..70d3cffb0251 100644
--- a/arch/parisc/install.sh
+++ b/arch/parisc/install.sh
@@ -39,6 +39,7 @@ verify "$3"
if [ -n "${INSTALLKERNEL}" ]; then
if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
+ if [ -x /usr/sbin/${INSTALLKERNEL} ]; then exec /usr/sbin/${INSTALLKERNEL} "$@"; fi
fi
# Default install
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 9fb1e794831b..061119a56fbe 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -249,30 +249,16 @@ void __init time_init(void)
static int __init init_cr16_clocksource(void)
{
/*
- * The cr16 interval timers are not syncronized across CPUs on
- * different sockets, so mark them unstable and lower rating on
- * multi-socket SMP systems.
+ * The cr16 interval timers are not syncronized across CPUs, even if
+ * they share the same socket.
*/
if (num_online_cpus() > 1 && !running_on_qemu) {
- int cpu;
- unsigned long cpu0_loc;
- cpu0_loc = per_cpu(cpu_data, 0).cpu_loc;
-
- for_each_online_cpu(cpu) {
- if (cpu == 0)
- continue;
- if ((cpu0_loc != 0) &&
- (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc))
- continue;
-
- /* mark sched_clock unstable */
- clear_sched_clock_stable();
-
- clocksource_cr16.name = "cr16_unstable";
- clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
- clocksource_cr16.rating = 0;
- break;
- }
+ /* mark sched_clock unstable */
+ clear_sched_clock_stable();
+
+ clocksource_cr16.name = "cr16_unstable";
+ clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
+ clocksource_cr16.rating = 0;
}
/* register at clocksource framework */
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 6baa676e7cb6..5d77d3f5fbb5 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -422,11 +422,17 @@ static inline int create_stub(const Elf64_Shdr *sechdrs,
const char *name)
{
long reladdr;
+ func_desc_t desc;
+ int i;
if (is_mprofile_ftrace_call(name))
return create_ftrace_stub(entry, addr, me);
- memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns));
+ for (i = 0; i < sizeof(ppc64_stub_insns) / sizeof(u32); i++) {
+ if (patch_instruction(&entry->jump[i],
+ ppc_inst(ppc64_stub_insns[i])))
+ return 0;
+ }
/* Stub uses address relative to r2. */
reladdr = (unsigned long)entry - my_r2(sechdrs, me);
@@ -437,10 +443,24 @@ static inline int create_stub(const Elf64_Shdr *sechdrs,
}
pr_debug("Stub %p get data from reladdr %li\n", entry, reladdr);
- entry->jump[0] |= PPC_HA(reladdr);
- entry->jump[1] |= PPC_LO(reladdr);
- entry->funcdata = func_desc(addr);
- entry->magic = STUB_MAGIC;
+ if (patch_instruction(&entry->jump[0],
+ ppc_inst(entry->jump[0] | PPC_HA(reladdr))))
+ return 0;
+
+ if (patch_instruction(&entry->jump[1],
+ ppc_inst(entry->jump[1] | PPC_LO(reladdr))))
+ return 0;
+
+ // func_desc_t is 8 bytes if ABIv2, else 16 bytes
+ desc = func_desc(addr);
+ for (i = 0; i < sizeof(func_desc_t) / sizeof(u32); i++) {
+ if (patch_instruction(((u32 *)&entry->funcdata) + i,
+ ppc_inst(((u32 *)(&desc))[i])))
+ return 0;
+ }
+
+ if (patch_instruction(&entry->magic, ppc_inst(STUB_MAGIC)))
+ return 0;
return 1;
}
@@ -495,8 +515,11 @@ static int restore_r2(const char *name, u32 *instruction, struct module *me)
me->name, *instruction, instruction);
return 0;
}
+
/* ld r2,R2_STACK_OFFSET(r1) */
- *instruction = PPC_INST_LD_TOC;
+ if (patch_instruction(instruction, ppc_inst(PPC_INST_LD_TOC)))
+ return 0;
+
return 1;
}
@@ -636,9 +659,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
}
/* Only replace bits 2 through 26 */
- *(uint32_t *)location
- = (*(uint32_t *)location & ~0x03fffffc)
+ value = (*(uint32_t *)location & ~0x03fffffc)
| (value & 0x03fffffc);
+
+ if (patch_instruction((u32 *)location, ppc_inst(value)))
+ return -EFAULT;
+
break;
case R_PPC64_REL64:
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 83f4a6389a28..d7081e9af65c 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -220,7 +220,7 @@ static int smp_85xx_start_cpu(int cpu)
local_irq_save(flags);
hard_irq_disable();
- if (qoriq_pm_ops)
+ if (qoriq_pm_ops && qoriq_pm_ops->cpu_up_prepare)
qoriq_pm_ops->cpu_up_prepare(cpu);
/* if cpu is not spinning, reset it */
@@ -292,7 +292,7 @@ static int smp_85xx_kick_cpu(int nr)
booting_thread_hwid = cpu_thread_in_core(nr);
primary = cpu_first_thread_sibling(nr);
- if (qoriq_pm_ops)
+ if (qoriq_pm_ops && qoriq_pm_ops->cpu_up_prepare)
qoriq_pm_ops->cpu_up_prepare(nr);
/*
diff --git a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts
index ba304d4c455c..ced0d4e47938 100644
--- a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts
+++ b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts
@@ -76,6 +76,7 @@
spi-max-frequency = <20000000>;
voltage-ranges = <3300 3300>;
disable-wp;
+ gpios = <&gpio 11 GPIO_ACTIVE_LOW>;
};
};
diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts
index 4f66919215f6..6bfa1f24d3de 100644
--- a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts
+++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts
@@ -2,6 +2,7 @@
/* Copyright (c) 2020 SiFive, Inc */
#include "fu740-c000.dtsi"
+#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/interrupt-controller/irq.h>
/* Clock frequency (in Hz) of the PCB crystal for rtcclk */
@@ -54,10 +55,21 @@
temperature-sensor@4c {
compatible = "ti,tmp451";
reg = <0x4c>;
+ vcc-supply = <&vdd_bpro>;
interrupt-parent = <&gpio>;
interrupts = <6 IRQ_TYPE_LEVEL_LOW>;
};
+ eeprom@54 {
+ compatible = "microchip,24c02", "atmel,24c02";
+ reg = <0x54>;
+ vcc-supply = <&vdd_bpro>;
+ label = "board-id";
+ pagesize = <16>;
+ read-only;
+ size = <256>;
+ };
+
pmic@58 {
compatible = "dlg,da9063";
reg = <0x58>;
@@ -65,48 +77,44 @@
interrupts = <1 IRQ_TYPE_LEVEL_LOW>;
interrupt-controller;
- regulators {
- vdd_bcore1: bcore1 {
- regulator-min-microvolt = <900000>;
- regulator-max-microvolt = <900000>;
- regulator-min-microamp = <5000000>;
- regulator-max-microamp = <5000000>;
- regulator-always-on;
- };
+ onkey {
+ compatible = "dlg,da9063-onkey";
+ };
- vdd_bcore2: bcore2 {
- regulator-min-microvolt = <900000>;
- regulator-max-microvolt = <900000>;
- regulator-min-microamp = <5000000>;
- regulator-max-microamp = <5000000>;
+ rtc {
+ compatible = "dlg,da9063-rtc";
+ };
+
+ wdt {
+ compatible = "dlg,da9063-watchdog";
+ };
+
+ regulators {
+ vdd_bcore: bcores-merged {
+ regulator-min-microvolt = <1050000>;
+ regulator-max-microvolt = <1050000>;
+ regulator-min-microamp = <4800000>;
+ regulator-max-microamp = <4800000>;
regulator-always-on;
};
vdd_bpro: bpro {
regulator-min-microvolt = <1800000>;
regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <2500000>;
- regulator-max-microamp = <2500000>;
+ regulator-min-microamp = <2400000>;
+ regulator-max-microamp = <2400000>;
regulator-always-on;
};
vdd_bperi: bperi {
- regulator-min-microvolt = <1050000>;
- regulator-max-microvolt = <1050000>;
+ regulator-min-microvolt = <1060000>;
+ regulator-max-microvolt = <1060000>;
regulator-min-microamp = <1500000>;
regulator-max-microamp = <1500000>;
regulator-always-on;
};
- vdd_bmem: bmem {
- regulator-min-microvolt = <1200000>;
- regulator-max-microvolt = <1200000>;
- regulator-min-microamp = <3000000>;
- regulator-max-microamp = <3000000>;
- regulator-always-on;
- };
-
- vdd_bio: bio {
+ vdd_bmem_bio: bmem-bio-merged {
regulator-min-microvolt = <1200000>;
regulator-max-microvolt = <1200000>;
regulator-min-microamp = <3000000>;
@@ -117,86 +125,66 @@
vdd_ldo1: ldo1 {
regulator-min-microvolt = <1800000>;
regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <100000>;
- regulator-max-microamp = <100000>;
regulator-always-on;
};
vdd_ldo2: ldo2 {
regulator-min-microvolt = <1800000>;
regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <200000>;
- regulator-max-microamp = <200000>;
regulator-always-on;
};
vdd_ldo3: ldo3 {
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <200000>;
- regulator-max-microamp = <200000>;
+ regulator-min-microvolt = <3300000>;
+ regulator-max-microvolt = <3300000>;
regulator-always-on;
};
vdd_ldo4: ldo4 {
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <200000>;
- regulator-max-microamp = <200000>;
+ regulator-min-microvolt = <2500000>;
+ regulator-max-microvolt = <2500000>;
regulator-always-on;
};
vdd_ldo5: ldo5 {
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <100000>;
- regulator-max-microamp = <100000>;
+ regulator-min-microvolt = <3300000>;
+ regulator-max-microvolt = <3300000>;
regulator-always-on;
};
vdd_ldo6: ldo6 {
- regulator-min-microvolt = <3300000>;
- regulator-max-microvolt = <3300000>;
- regulator-min-microamp = <200000>;
- regulator-max-microamp = <200000>;
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <1800000>;
regulator-always-on;
};
vdd_ldo7: ldo7 {
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <200000>;
- regulator-max-microamp = <200000>;
+ regulator-min-microvolt = <3300000>;
+ regulator-max-microvolt = <3300000>;
regulator-always-on;
};
vdd_ldo8: ldo8 {
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <1800000>;
- regulator-min-microamp = <200000>;
- regulator-max-microamp = <200000>;
+ regulator-min-microvolt = <3300000>;
+ regulator-max-microvolt = <3300000>;
regulator-always-on;
};
vdd_ld09: ldo9 {
regulator-min-microvolt = <1050000>;
regulator-max-microvolt = <1050000>;
- regulator-min-microamp = <200000>;
- regulator-max-microamp = <200000>;
+ regulator-always-on;
};
vdd_ldo10: ldo10 {
regulator-min-microvolt = <1000000>;
regulator-max-microvolt = <1000000>;
- regulator-min-microamp = <300000>;
- regulator-max-microamp = <300000>;
+ regulator-always-on;
};
vdd_ldo11: ldo11 {
regulator-min-microvolt = <2500000>;
regulator-max-microvolt = <2500000>;
- regulator-min-microamp = <300000>;
- regulator-max-microamp = <300000>;
regulator-always-on;
};
};
@@ -223,6 +211,7 @@
spi-max-frequency = <20000000>;
voltage-ranges = <3300 3300>;
disable-wp;
+ gpios = <&gpio 15 GPIO_ACTIVE_LOW>;
};
};
@@ -245,4 +234,8 @@
&gpio {
status = "okay";
+ gpio-line-names = "J29.1", "PMICNTB", "PMICSHDN", "J8.1", "J8.3",
+ "PCIe_PWREN", "THERM", "UBRDG_RSTN", "PCIe_PERSTN",
+ "ULPI_RSTN", "J8.2", "UHUB_RSTN", "GEMGXL_RST", "J8.4",
+ "EN_VDD_SD", "SD_CD";
};
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 25ba21f98504..2639b9ee48f9 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -12,14 +12,12 @@
#include <linux/types.h>
#include <linux/kvm.h>
#include <linux/kvm_types.h>
+#include <asm/csr.h>
#include <asm/kvm_vcpu_fp.h>
#include <asm/kvm_vcpu_timer.h>
-#ifdef CONFIG_64BIT
-#define KVM_MAX_VCPUS (1U << 16)
-#else
-#define KVM_MAX_VCPUS (1U << 9)
-#endif
+#define KVM_MAX_VCPUS \
+ ((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1)
#define KVM_HALT_POLL_NS_DEFAULT 500000
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index d81bae8eb55e..fc058ff5f4b6 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -453,6 +453,12 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
+ gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
+ phys_addr_t size = slot->npages << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ stage2_unmap_range(kvm, gpa, size, false);
+ spin_unlock(&kvm->mmu_lock);
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index fd825097cf04..e45cc27716de 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -117,6 +117,7 @@ CONFIG_UNIX=y
CONFIG_UNIX_DIAG=m
CONFIG_XFRM_USER=m
CONFIG_NET_KEY=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_SMC=m
CONFIG_SMC_DIAG=m
CONFIG_INET=y
@@ -403,7 +404,6 @@ CONFIG_DEVTMPFS=y
CONFIG_CONNECTOR=y
CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
@@ -476,6 +476,7 @@ CONFIG_MACVLAN=m
CONFIG_MACVTAP=m
CONFIG_VXLAN=m
CONFIG_BAREUDP=m
+CONFIG_AMT=m
CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
@@ -489,6 +490,7 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_AMD is not set
# CONFIG_NET_VENDOR_AQUANTIA is not set
# CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_VENDOR_ASIX is not set
# CONFIG_NET_VENDOR_ATHEROS is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
# CONFIG_NET_VENDOR_BROCADE is not set
@@ -510,6 +512,7 @@ CONFIG_NLMON=m
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
+CONFIG_MLX5_ESWITCH=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
@@ -571,6 +574,7 @@ CONFIG_WATCHDOG=y
CONFIG_WATCHDOG_NOWAYOUT=y
CONFIG_SOFT_WATCHDOG=m
CONFIG_DIAG288_WATCHDOG=m
+# CONFIG_DRM_DEBUG_MODESET_LOCK is not set
CONFIG_FB=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
@@ -775,12 +779,14 @@ CONFIG_CRC4=m
CONFIG_CRC7=m
CONFIG_CRC8=m
CONFIG_RANDOM32_SELFTEST=y
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=0
CONFIG_PRINTK_TIME=y
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_DEBUG_INFO_BTF=y
CONFIG_GDB_SCRIPTS=y
CONFIG_HEADERS_INSTALL=y
CONFIG_DEBUG_SECTION_MISMATCH=y
@@ -807,6 +813,7 @@ CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
CONFIG_DEBUG_PER_CPU_MAPS=y
CONFIG_KFENCE=y
+CONFIG_KFENCE_STATIC_KEYS=y
CONFIG_DEBUG_SHIRQ=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_DETECT_HUNG_TASK=y
@@ -842,6 +849,7 @@ CONFIG_FTRACE_STARTUP_TEST=y
CONFIG_SAMPLES=y
CONFIG_SAMPLE_TRACE_PRINTK=m
CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
CONFIG_DEBUG_ENTRY=y
CONFIG_CIO_INJECT=y
CONFIG_KUNIT=m
@@ -860,7 +868,7 @@ CONFIG_FAIL_FUNCTION=y
CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y
CONFIG_LKDTM=m
CONFIG_TEST_MIN_HEAP=y
-CONFIG_KPROBES_SANITY_TEST=y
+CONFIG_KPROBES_SANITY_TEST=m
CONFIG_RBTREE_TEST=y
CONFIG_INTERVAL_TREE_TEST=m
CONFIG_PERCPU_TEST=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index c9c3cedff2d8..1c750bfca2d8 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -109,6 +109,7 @@ CONFIG_UNIX=y
CONFIG_UNIX_DIAG=m
CONFIG_XFRM_USER=m
CONFIG_NET_KEY=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_SMC=m
CONFIG_SMC_DIAG=m
CONFIG_INET=y
@@ -394,7 +395,6 @@ CONFIG_DEVTMPFS=y
CONFIG_CONNECTOR=y
CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
@@ -467,6 +467,7 @@ CONFIG_MACVLAN=m
CONFIG_MACVTAP=m
CONFIG_VXLAN=m
CONFIG_BAREUDP=m
+CONFIG_AMT=m
CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
@@ -480,6 +481,7 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_AMD is not set
# CONFIG_NET_VENDOR_AQUANTIA is not set
# CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_VENDOR_ASIX is not set
# CONFIG_NET_VENDOR_ATHEROS is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
# CONFIG_NET_VENDOR_BROCADE is not set
@@ -501,6 +503,7 @@ CONFIG_NLMON=m
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
+CONFIG_MLX5_ESWITCH=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
@@ -762,12 +765,14 @@ CONFIG_PRIME_NUMBERS=m
CONFIG_CRC4=m
CONFIG_CRC7=m
CONFIG_CRC8=m
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=0
CONFIG_PRINTK_TIME=y
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_DEBUG_INFO_BTF=y
CONFIG_GDB_SCRIPTS=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_MAGIC_SYSRQ=y
@@ -792,9 +797,11 @@ CONFIG_HIST_TRIGGERS=y
CONFIG_SAMPLES=y
CONFIG_SAMPLE_TRACE_PRINTK=m
CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
CONFIG_KUNIT=m
CONFIG_KUNIT_DEBUGFS=y
CONFIG_LKDTM=m
+CONFIG_KPROBES_SANITY_TEST=m
CONFIG_PERCPU_TEST=m
CONFIG_ATOMIC64_SELFTEST=y
CONFIG_TEST_BPF=m
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index aceccf3b9a88..eed3b9acfa71 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -65,9 +65,11 @@ CONFIG_ZFCP=y
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_LSM="yama,loadpin,safesetid,integrity"
# CONFIG_ZLIB_DFLTCC is not set
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_PRINTK_TIME=y
# CONFIG_SYMBOLIC_ERRNAME is not set
CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_BTF=y
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_PANIC_ON_OOPS=y
diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h
index e4dc64cc9c55..287bb88f7698 100644
--- a/arch/s390/include/asm/pci_io.h
+++ b/arch/s390/include/asm/pci_io.h
@@ -14,12 +14,13 @@
/* I/O Map */
#define ZPCI_IOMAP_SHIFT 48
-#define ZPCI_IOMAP_ADDR_BASE 0x8000000000000000UL
+#define ZPCI_IOMAP_ADDR_SHIFT 62
+#define ZPCI_IOMAP_ADDR_BASE (1UL << ZPCI_IOMAP_ADDR_SHIFT)
#define ZPCI_IOMAP_ADDR_OFF_MASK ((1UL << ZPCI_IOMAP_SHIFT) - 1)
#define ZPCI_IOMAP_MAX_ENTRIES \
- ((ULONG_MAX - ZPCI_IOMAP_ADDR_BASE + 1) / (1UL << ZPCI_IOMAP_SHIFT))
+ (1UL << (ZPCI_IOMAP_ADDR_SHIFT - ZPCI_IOMAP_SHIFT))
#define ZPCI_IOMAP_ADDR_IDX_MASK \
- (~ZPCI_IOMAP_ADDR_OFF_MASK - ZPCI_IOMAP_ADDR_BASE)
+ ((ZPCI_IOMAP_ADDR_BASE - 1) & ~ZPCI_IOMAP_ADDR_OFF_MASK)
struct zpci_iomap_entry {
u32 fh;
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 5510c7d10ddc..21d62d8b6b9a 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -290,7 +290,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
return;
regs = ftrace_get_regs(fregs);
- preempt_disable_notrace();
p = get_kprobe((kprobe_opcode_t *)ip);
if (unlikely(!p) || kprobe_disabled(p))
goto out;
@@ -318,7 +317,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
}
__this_cpu_write(current_kprobe, NULL);
out:
- preempt_enable_notrace();
ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 0df83ecaa2e0..cb7099682340 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -138,7 +138,7 @@ void noinstr do_io_irq(struct pt_regs *regs)
struct pt_regs *old_regs = set_irq_regs(regs);
int from_idle;
- irq_enter();
+ irq_enter_rcu();
if (user_mode(regs)) {
update_timer_sys();
@@ -158,7 +158,8 @@ void noinstr do_io_irq(struct pt_regs *regs)
do_irq_async(regs, IO_INTERRUPT);
} while (MACHINE_IS_LPAR && irq_pending(regs));
- irq_exit();
+ irq_exit_rcu();
+
set_irq_regs(old_regs);
irqentry_exit(regs, state);
@@ -172,7 +173,7 @@ void noinstr do_ext_irq(struct pt_regs *regs)
struct pt_regs *old_regs = set_irq_regs(regs);
int from_idle;
- irq_enter();
+ irq_enter_rcu();
if (user_mode(regs)) {
update_timer_sys();
@@ -190,7 +191,7 @@ void noinstr do_ext_irq(struct pt_regs *regs)
do_irq_async(regs, EXT_INTERRUPT);
- irq_exit();
+ irq_exit_rcu();
set_irq_regs(old_regs);
irqentry_exit(regs, state);
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 9975ad200d74..8f43575a4dd3 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -7,6 +7,8 @@
* Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/elf.h>
#include <linux/errno.h>
#include <linux/kexec.h>
@@ -290,8 +292,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
const Elf_Shdr *relsec,
const Elf_Shdr *symtab)
{
+ const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
Elf_Rela *relas;
int i, r_type;
+ int ret;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
relas = (void *)pi->ehdr + relsec->sh_offset;
@@ -304,15 +314,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
sym = (void *)pi->ehdr + symtab->sh_offset;
sym += ELF64_R_SYM(relas[i].r_info);
- if (sym->st_shndx == SHN_UNDEF)
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
return -ENOEXEC;
+ }
- if (sym->st_shndx == SHN_COMMON)
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
return -ENOEXEC;
+ }
if (sym->st_shndx >= pi->ehdr->e_shnum &&
- sym->st_shndx != SHN_ABS)
+ sym->st_shndx != SHN_ABS) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
return -ENOEXEC;
+ }
loc = pi->purgatory_buf;
loc += section->sh_offset;
@@ -326,7 +348,15 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
addr = section->sh_addr + relas[i].r_offset;
r_type = ELF64_R_TYPE(relas[i].r_info);
- arch_kexec_do_relocs(r_type, loc, val, addr);
+
+ if (r_type == R_390_PLT32DBL)
+ r_type = R_390_PC32DBL;
+
+ ret = arch_kexec_do_relocs(r_type, loc, val, addr);
+ if (ret) {
+ pr_err("Unknown rela relocation: %d\n", r_type);
+ return -ENOEXEC;
+ }
}
return 0;
}
diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c
index cfc5f5557c06..bc7973359ae2 100644
--- a/arch/s390/lib/test_unwind.c
+++ b/arch/s390/lib/test_unwind.c
@@ -173,10 +173,11 @@ static noinline int unwindme_func4(struct unwindme *u)
}
/*
- * trigger specification exception
+ * Trigger operation exception; use insn notation to bypass
+ * llvm's integrated assembler sanity checks.
*/
asm volatile(
- " mvcl %%r1,%%r1\n"
+ " .insn e,0x0000\n" /* illegal opcode */
"0: nopr %%r7\n"
EX_TABLE(0b, 0b)
:);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327d1eff..5c2ccb85f2ef 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1932,6 +1932,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e38a4cf795d9..97b1f84bb53f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
POP_REGS pop_rdi=0
/*
@@ -890,6 +894,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx
+
/*
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
@@ -897,21 +902,14 @@ SYM_CODE_START_LOCAL(paranoid_entry)
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
- jns .Lparanoid_entry_swapgs
- ret
+ js .Lparanoid_kernel_gsbase
-.Lparanoid_entry_swapgs:
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
swapgs
+.Lparanoid_kernel_gsbase:
- /*
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
- * unconditional CR3 write, even in the PTI case. So do an lfence
- * to prevent GS speculation, regardless of whether PTI is enabled.
- */
FENCE_SWAPGS_KERNEL_ENTRY
-
- /* EBX = 0 -> SWAPGS required on exit */
- xorl %ebx, %ebx
ret
SYM_CODE_END(paranoid_entry)
@@ -993,11 +991,6 @@ SYM_CODE_START_LOCAL(error_entry)
pushq %r12
ret
-.Lerror_entry_done_lfence:
- FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
- ret
-
/*
* There are two places in the kernel that can potentially fault with
* usergs. Handle them here. B stepping K8s sometimes report a
@@ -1020,8 +1013,14 @@ SYM_CODE_START_LOCAL(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
- FENCE_SWAPGS_USER_ENTRY
- jmp .Lerror_entry_done
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ ret
.Lbstep_iret:
/* Fix truncated RIP */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 5a0bcf8b78d7..048b6d5aff50 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -108,7 +108,7 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
-#define INTEL_FAM6_RAPTOR_LAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE 0xB7
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ac61f85e07b..2164b9f4c7b0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -97,7 +97,7 @@
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_TLB_FLUSH_GUEST \
- KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
@@ -1036,6 +1036,7 @@ struct kvm_x86_msr_filter {
#define APICV_INHIBIT_REASON_PIT_REINJ 4
#define APICV_INHIBIT_REASON_X2APIC 5
#define APICV_INHIBIT_REASON_BLOCKIRQ 6
+#define APICV_INHIBIT_REASON_ABSENT 7
struct kvm_arch {
unsigned long n_used_mmu_pages;
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 2cef6c5a52c2..6acaf5af0a3d 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -73,4 +73,15 @@
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+/*
+ * Error codes related to GHCB input that can be communicated back to the guest
+ * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2.
+ */
+#define GHCB_ERR_NOT_REGISTERED 1
+#define GHCB_ERR_INVALID_USAGE 2
+#define GHCB_ERR_INVALID_SCRATCH_AREA 3
+#define GHCB_ERR_MISSING_INPUT 4
+#define GHCB_ERR_INVALID_INPUT 5
+#define GHCB_ERR_INVALID_EVENT 6
+
#endif
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index d5958278eba6..91d4b6de58ab 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -118,7 +118,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
- struct _fpx_sw_bytes sw_bytes;
+ struct _fpx_sw_bytes sw_bytes = {};
u32 xfeatures;
int err;
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 74f0ec955384..a9fc2ac7a8bd 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -294,11 +294,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
char *dst, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
- char __user *target = (char __user *)dst;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __put_user() independent of whether kernel or user
@@ -320,26 +315,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
memcpy(&d1, buf, 1);
if (__put_user(d1, target))
goto fault;
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
memcpy(&d2, buf, 2);
if (__put_user(d2, target))
goto fault;
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
memcpy(&d4, buf, 4);
if (__put_user(d4, target))
goto fault;
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
memcpy(&d8, buf, 8);
if (__put_user(d8, target))
goto fault;
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -362,11 +373,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
char *src, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT;
- char __user *s = (char __user *)src;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __get_user() independent of whether kernel or user
@@ -388,26 +394,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ac2909f0cab3..617012f4619f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
{ NULL, },
};
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
@@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2e076a459a0c..a698196377be 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows to detect even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is an reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 50a4515fe0ad..9452dc9664b5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5e19e6e4c2ce..8d8c1cc7cb53 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1922,11 +1922,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (all_cpus)
+ goto check_and_send_ipi;
+
if (!sparse_banks_len)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
+ if (kvm_read_guest(kvm,
hc->ingpa + offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents),
sparse_banks,
@@ -1934,6 +1936,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index e66e620c3bed..539333ac4b38 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -81,7 +81,6 @@ struct kvm_ioapic {
unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
- void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 650642b18d15..c2d7cfe82d00 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -56,7 +56,6 @@ struct kvm_pic {
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_elcr;
- void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[PIC_NUM_PINS];
};
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 759952dd1222..f206fc35deff 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -707,7 +707,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
- if (apic->vcpu->arch.apicv_active)
+ if (kvm_x86_ops.sync_pir_to_irr)
highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3be9beea838d..e2e1d012df22 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1582,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
}
@@ -1936,7 +1936,11 @@ static void mmu_audit_disable(void) { }
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- return sp->role.invalid ||
+ if (sp->role.invalid)
+ return true;
+
+ /* TDP MMU pages due not use the MMU generation. */
+ return !sp->tdp_mmu_page &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2173,10 +2177,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu->shadow_root_level;
- if (iterator->level == PT64_ROOT_4LEVEL &&
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu->direct_map)
- --iterator->level;
+ iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
/*
@@ -3976,6 +3980,20 @@ out_retry:
return true;
}
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int mmu_seq)
+{
+ if (is_obsolete_sp(vcpu->kvm, to_shadow_page(vcpu->arch.mmu->root_hpa)))
+ return true;
+
+ return fault->slot &&
+ mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+}
+
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
@@ -4013,8 +4031,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
else
write_lock(&vcpu->kvm->mmu_lock);
- if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
+
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
@@ -4855,7 +4874,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
.cr0 = cr0,
- .cr4 = cr4,
+ .cr4 = cr4 & ~X86_CR4_PKE,
.efer = efer,
};
union kvm_mmu_role new_role;
@@ -4919,7 +4938,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->direct_map = false;
update_permission_bitmask(context, true);
- update_pkru_bitmask(context);
+ context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -5025,6 +5044,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
/*
* Invalidate all MMU roles to force them to reinitialize as CPUID
* information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models with respect to paging and
+ * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments). For now that
+ * problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
@@ -5032,24 +5059,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
/*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
- * sweep the problem under the rug.
- *
- * KVM's horrific CPUID ABI makes the problem all but impossible to
- * solve, as correctly handling multiple vCPU models (with respect to
- * paging and physical address properties) in a single VM would require
- * tracking all relevant CPUID information in kvm_mmu_page_role. That
- * is very undesirable as it would double the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments), and in practice
- * no sane VMM mucks with the core vCPU model on the fly.
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
- }
+ KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5369,7 +5382,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
+ kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5854,8 +5867,6 @@ restart:
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
- bool flush = false;
-
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
/*
@@ -5863,17 +5874,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
* logging at a 4k granularity and never creates collapsible
* 2m SPTEs during dirty logging.
*/
- flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (flush)
+ if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
- flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -6182,23 +6190,46 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
}
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
bool was_recovery_enabled, is_recovery_enabled;
uint old_period, new_period;
int err;
- was_recovery_enabled = nx_huge_pages_recovery_ratio;
- old_period = nx_huge_pages_recovery_period_ms;
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
err = param_set_uint(val, kp);
if (err)
return err;
- is_recovery_enabled = nx_huge_pages_recovery_ratio;
- new_period = nx_huge_pages_recovery_period_ms;
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
- if (READ_ONCE(nx_huge_pages) && is_recovery_enabled &&
+ if (is_recovery_enabled &&
(!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
@@ -6262,18 +6293,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
- uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- uint period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ bool enabled;
+ uint period;
- if (!period && ratio) {
- /* Make sure the period is not less than one second. */
- ratio = min(ratio, 3600u);
- period = 60 * 60 * 1000 / ratio;
- }
+ enabled = calc_nx_huge_pages_recovery_period(&period);
- return READ_ONCE(nx_huge_pages) && ratio
- ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
+ return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
}
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f87d36898c44..708a5d297fe1 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
+
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a54c3491af42..1db8496259ad 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
- u64 old_child_spte;
- u64 *sptep;
- gfn_t gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
@@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 *sptep = rcu_dereference(pt) + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_child_spte;
if (shared) {
/*
@@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -1033,9 +1031,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
- flush |= zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ flush = zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
return flush;
}
@@ -1364,10 +1362,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static bool zap_collapsible_spte_range(struct kvm *kvm,
+static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- const struct kvm_memory_slot *slot,
- bool flush)
+ const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
@@ -1378,10 +1375,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
tdp_root_for_each_pte(iter, root, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
- flush = false;
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- }
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1393,6 +1388,7 @@ retry:
pfn, PG_LEVEL_NUM))
continue;
+ /* Note, a successful atomic zap also does a remote TLB flush. */
if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
* The iter must explicitly re-read the SPTE because
@@ -1401,30 +1397,24 @@ retry:
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
goto retry;
}
- flush = true;
}
rcu_read_unlock();
-
- return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush)
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
- flush = zap_collapsible_spte_range(kvm, root, slot, flush);
-
- return flush;
+ zap_collapsible_spte_range(kvm, root, slot);
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 476b133544dd..3899004a5d91 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush);
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index affc0ea98d30..8f9af7b7dbbe 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -900,6 +900,7 @@ out:
bool svm_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_NESTED) |
BIT(APICV_INHIBIT_REASON_IRQWIN) |
@@ -989,16 +990,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int cpu = get_cpu();
+ WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- if (is_run)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ if (is_run)
+ avic_vcpu_load(vcpu, cpu);
+ else
+ avic_vcpu_put(vcpu);
+ }
+ put_cpu();
}
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 871c426ec389..b4095dfeeee6 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -281,7 +281,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
- pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->reserved_bits = 0xfffffff000280000ull;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 21ac0a5de4e0..7656a2c5662a 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1543,28 +1543,50 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
return false;
}
-static int sev_lock_for_migration(struct kvm *kvm)
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
/*
- * Bail if this VM is already involved in a migration to avoid deadlock
- * between two VMs trying to migrate to/from each other.
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
*/
- if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1))
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
return -EBUSY;
- mutex_lock(&kvm->lock);
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable(&src_kvm->lock))
+ goto unlock_dst;
return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
}
-static void sev_unlock_after_migration(struct kvm *kvm)
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
- mutex_unlock(&kvm->lock);
- atomic_set_release(&sev->migration_in_progress, 0);
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
}
@@ -1607,14 +1629,15 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
dst->asid = src->asid;
dst->handle = src->handle;
dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
src->asid = 0;
src->active = false;
src->handle = 0;
src->pages_locked = 0;
+ src->enc_context_owner = NULL;
- INIT_LIST_HEAD(&dst->regions_list);
- list_replace_init(&src->regions_list, &dst->regions_list);
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1666,15 +1689,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
bool charged = false;
int ret;
- ret = sev_lock_for_migration(kvm);
- if (ret)
- return ret;
-
- if (sev_guest(kvm)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
@@ -1682,16 +1696,26 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
}
source_kvm = source_kvm_file->private_data;
- ret = sev_lock_for_migration(source_kvm);
+ ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
- if (!sev_guest(source_kvm)) {
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
- goto out_source;
+ goto out_unlock;
}
src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ /*
+ * VMs mirroring src's encryption context rely on it to keep the
+ * ASID allocated, but below we are clearing src_sev->asid.
+ */
+ if (src_sev->num_mirrored_vms) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1728,13 +1752,11 @@ out_dst_cgroup:
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
-out_source:
- sev_unlock_after_migration(source_kvm);
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
out_fput:
if (source_kvm_file)
fput(source_kvm_file);
-out_unlock:
- sev_unlock_after_migration(kvm);
return ret;
}
@@ -1953,76 +1975,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
- struct kvm_sev_info source_sev, *mirror_sev;
+ struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
- goto e_source_put;
+ goto e_source_fput;
}
source_kvm = source_kvm_file->private_data;
- mutex_lock(&source_kvm->lock);
-
- if (!sev_guest(source_kvm)) {
- ret = -EINVAL;
- goto e_source_unlock;
- }
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
- /* Mirrors of mirrors should work, but let's not get silly */
- if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
ret = -EINVAL;
- goto e_source_unlock;
+ goto e_unlock;
}
- memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info,
- sizeof(source_sev));
-
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
-
- fput(source_kvm_file);
- mutex_unlock(&source_kvm->lock);
- mutex_lock(&kvm->lock);
-
- /*
- * Disallow out-of-band SEV/SEV-ES init if the target is already an
- * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
- * created after SEV/SEV-ES initialization, e.g. to init intercepts.
- */
- if (sev_guest(kvm) || kvm->created_vcpus) {
- ret = -EINVAL;
- goto e_mirror_unlock;
- }
+ source_sev->num_mirrored_vms++;
/* Set enc_context_owner and copy its encryption context over */
mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
- mirror_sev->asid = source_sev.asid;
- mirror_sev->fd = source_sev.fd;
- mirror_sev->es_active = source_sev.es_active;
- mirror_sev->handle = source_sev.handle;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ ret = 0;
+
/*
* Do not copy ap_jump_table. Since the mirror does not share the same
* KVM contexts as the original, and they may have different
* memory-views.
*/
- mutex_unlock(&kvm->lock);
- return 0;
-
-e_mirror_unlock:
- mutex_unlock(&kvm->lock);
- kvm_put_kvm(source_kvm);
- return ret;
-e_source_unlock:
- mutex_unlock(&source_kvm->lock);
-e_source_put:
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
if (source_kvm_file)
fput(source_kvm_file);
return ret;
@@ -2034,17 +2040,24 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
+ WARN_ON(sev->num_mirrored_vms);
+
if (!sev_guest(kvm))
return;
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
- kvm_put_kvm(sev->enc_context_owner);
+ struct kvm *owner_kvm = sev->enc_context_owner;
+ struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
+
+ mutex_lock(&owner_kvm->lock);
+ if (!WARN_ON(!owner_sev->num_mirrored_vms))
+ owner_sev->num_mirrored_vms--;
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
return;
}
- mutex_lock(&kvm->lock);
-
/*
* Ensure that all guest tagged cache entries are flushed before
* releasing the pages back to the system for use. CLFLUSH will
@@ -2064,8 +2077,6 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- mutex_unlock(&kvm->lock);
-
sev_unbind_asid(kvm, sev->handle);
sev_asid_free(sev);
}
@@ -2249,7 +2260,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
__free_page(virt_to_page(svm->sev_es.vmsa));
if (svm->sev_es.ghcb_sa_free)
- kfree(svm->sev_es.ghcb_sa);
+ kvfree(svm->sev_es.ghcb_sa);
}
static void dump_ghcb(struct vcpu_svm *svm)
@@ -2341,24 +2352,29 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu;
struct ghcb *ghcb;
- u64 exit_code = 0;
+ u64 exit_code;
+ u64 reason;
ghcb = svm->sev_es.ghcb;
- /* Only GHCB Usage code 0 is supported */
- if (ghcb->ghcb_usage)
- goto vmgexit_err;
-
/*
- * Retrieve the exit code now even though is may not be marked valid
+ * Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
exit_code = ghcb_get_sw_exit_code(ghcb);
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
if (!ghcb_sw_exit_code_is_valid(ghcb) ||
!ghcb_sw_exit_info_1_is_valid(ghcb) ||
!ghcb_sw_exit_info_2_is_valid(ghcb))
@@ -2437,30 +2453,34 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
break;
default:
+ reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
}
- return 0;
+ return true;
vmgexit_err:
vcpu = &svm->vcpu;
- if (ghcb->ghcb_usage) {
+ if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
} else {
- vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n",
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
exit_code);
dump_ghcb(svm);
}
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, reason);
- return -EINVAL;
+ return false;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
@@ -2482,7 +2502,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
svm->sev_es.ghcb_sa_sync = false;
}
- kfree(svm->sev_es.ghcb_sa);
+ kvfree(svm->sev_es.ghcb_sa);
svm->sev_es.ghcb_sa = NULL;
svm->sev_es.ghcb_sa_free = false;
}
@@ -2530,14 +2550,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
- return false;
+ goto e_scratch;
}
scratch_gpa_end = scratch_gpa_beg + len;
if (scratch_gpa_end < scratch_gpa_beg) {
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
len, scratch_gpa_beg);
- return false;
+ goto e_scratch;
}
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
@@ -2555,7 +2575,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_end > ghcb_scratch_end) {
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
scratch_gpa_beg, scratch_gpa_end);
- return false;
+ goto e_scratch;
}
scratch_va = (void *)svm->sev_es.ghcb;
@@ -2568,18 +2588,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
if (len > GHCB_SCRATCH_AREA_LIMIT) {
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
len, GHCB_SCRATCH_AREA_LIMIT);
- return false;
+ goto e_scratch;
}
- scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
- return false;
+ goto e_scratch;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
- kfree(scratch_va);
- return false;
+ kvfree(scratch_va);
+ goto e_scratch;
}
/*
@@ -2596,6 +2616,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
svm->sev_es.ghcb_sa_len = len;
return true;
+
+e_scratch:
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return false;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -2646,7 +2672,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
break;
}
@@ -2682,10 +2708,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_TERM_REASON_POS);
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- fallthrough;
+
+ ret = -EINVAL;
+ break;
}
default:
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
+ break;
}
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
@@ -2709,14 +2738,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
if (!ghcb_gpa) {
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
/* Unable to map GHCB from guest */
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
@@ -2726,15 +2759,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
exit_code = ghcb_get_sw_exit_code(ghcb);
- ret = sev_es_validate_vmgexit(svm);
- if (ret)
- return ret;
+ if (!sev_es_validate_vmgexit(svm))
+ return 1;
sev_es_sync_from_ghcb(svm);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
- ret = -EINVAL;
+ ret = 1;
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
@@ -2775,20 +2807,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(ghcb, 1);
- ghcb_set_sw_exit_info_2(ghcb,
- X86_TRAP_UD |
- SVM_EVTINJ_TYPE_EXEPT |
- SVM_EVTINJ_VALID);
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
}
- ret = 1;
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
break;
default:
ret = svm_invoke_exit_handler(vcpu, exit_code);
@@ -2810,7 +2839,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
return -EINVAL;
if (!setup_vmgexit_scratch(svm, in, bytes))
- return -EINVAL;
+ return 1;
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
count, in);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5630c241d5f6..d0f68d11ec70 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4651,7 +4651,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
- .sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 5faad3dc10e2..1c7306c370fa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,6 +79,7 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1e2f66951566..9c941535f78c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1162,29 +1162,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
WARN_ON(!enable_vpid);
/*
- * If VPID is enabled and used by vmc12, but L2 does not have a unique
- * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
- * a VPID for L2, flush the current context as the effective ASID is
- * common to both L1 and L2.
- *
- * Defer the flush so that it runs after vmcs02.EPTP has been set by
- * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
- * redundant flushes further down the nested pipeline.
- *
- * If a TLB flush isn't required due to any of the above, and vpid12 is
- * changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be flushed. There's no direct
- * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
- * guest-physical mappings, so there is no need to sync the nEPT MMU.
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
*/
- if (!nested_has_guest_tlb_tag(vcpu)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- } else if (is_vmenter &&
- vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- vpid_sync_context(nested_get_vpid02(vcpu));
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
}
+
+ /*
+ * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -2594,8 +2591,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->guest_ia32_perf_global_ctrl)))
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
+ }
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
@@ -3344,8 +3343,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -4502,9 +4500,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
(void)nested_get_evmcs_page(vcpu);
}
- /* Service the TLB flush request for L2 before switching to L1. */
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4857,6 +4854,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5289,8 +5287,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
struct vmcs_hdr hdr;
- if (ghc->gpa != vmptr &&
- kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5f81ef092bd4..1c94783b5a54 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,6 +5,7 @@
#include <asm/cpu.h>
#include "lapic.h"
+#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
@@ -77,13 +78,18 @@ after_clear_sn:
pi_set_on(pi_desc);
}
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
@@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return 0;
WARN_ON(irqs_disabled());
@@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951..5aadad3e7367 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2646,15 +2646,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -2918,6 +2909,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2930,31 +2928,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -6262,9 +6258,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
- if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
if (pi_test_on(&vmx->pi_desc)) {
@@ -6274,22 +6270,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
+ got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, this may cause a vmexit or it may
- * be injected into L2. Either way, this interrupt will be
- * processed via KVM_REQ_EVENT, not RVI, because we do not use
- * virtual interrupt delivery to inject L1 interrupts into L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
@@ -6826,6 +6833,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -7509,6 +7529,7 @@ static void hardware_unsetup(void)
static bool vmx_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
@@ -7761,10 +7782,10 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a403d92833f..0cf1082455df 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -890,7 +890,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
return 1;
- if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -3258,6 +3259,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior before nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
@@ -4133,6 +4157,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
@@ -4448,8 +4473,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -5124,6 +5148,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly, so fail.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5134,6 +5169,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
+ /*
+ * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
+ * KVM_SET_CPUID case above.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5698,6 +5741,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -6078,6 +6122,7 @@ set_identity_unlock:
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -7077,7 +7122,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val, unsigned int count)
{
if (vcpu->arch.pio.count) {
- /* Complete previous iteration. */
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
} else {
int r = __emulator_pio_in(vcpu, size, port, count);
if (!r)
@@ -7086,7 +7137,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
/* Results already available, fall through. */
}
- WARN_ON(count != vcpu->arch.pio.count);
complete_emulator_pio_in(vcpu, val);
return 1;
}
@@ -8776,10 +8826,9 @@ static void kvm_apicv_init(struct kvm *kvm)
{
init_rwsem(&kvm->arch.apicv_update_lock);
- if (enable_apicv)
- clear_bit(APICV_INHIBIT_REASON_DISABLE,
- &kvm->arch.apicv_inhibit_reasons);
- else
+ set_bit(APICV_INHIBIT_REASON_ABSENT,
+ &kvm->arch.apicv_inhibit_reasons);
+ if (!enable_apicv)
set_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
}
@@ -9528,8 +9577,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -9648,10 +9696,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
- kvm_vcpu_flush_tlb_guest(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9802,10 +9847,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* This handles the case where a posted interrupt was
- * notified with kvm_vcpu_kick.
+ * notified with kvm_vcpu_kick. Assigned devices can
+ * use the POSTED_INTR_VECTOR even if APICv is disabled,
+ * so do it even if APICv is disabled on this vCPU.
*/
- if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9849,8 +9896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 997669ae9caa..4abcd8d9836d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
-}
-
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 726700fabca6..bafe36e69227 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1252,19 +1252,54 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* test src_reg, src_reg */
- maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
- EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
- /* jne start_of_ldx */
- EMIT2(X86_JNE, 0);
+ /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
+ * add abs(insn->off) to the limit to make sure that negative
+ * offset won't be an issue.
+ * insn->off is s16, so it won't affect valid pointers.
+ */
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
+ u8 *end_of_jmp1, *end_of_jmp2;
+
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * 1. src_reg + insn->off >= limit
+ * 2. src_reg + insn->off doesn't become small positive.
+ * Cannot do src_reg + insn->off >= limit in one branch,
+ * since it needs two spare registers, but JIT has only one.
+ */
+
+ /* movabsq r11, limit */
+ EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
+ EMIT((u32)limit, 4);
+ EMIT(limit >> 32, 4);
+ /* cmp src_reg, r11 */
+ maybe_emit_mod(&prog, src_reg, AUX_REG, true);
+ EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* if unsigned '<' goto end_of_jmp2 */
+ EMIT2(X86_JB, 0);
+ end_of_jmp1 = prog;
+
+ /* mov r11, src_reg */
+ emit_mov_reg(&prog, true, AUX_REG, src_reg);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
+ /* jmp if not carry to start_of_ldx
+ * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
+ * that has to be rejected.
+ */
+ EMIT2(0x73 /* JNC */, 0);
+ end_of_jmp2 = prog;
+
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JNE above */
- temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ /* populate jmp_offset for JB above to jump to xor dst_reg */
+ end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
+ /* populate jmp_offset for JNC above to jump to start_of_ldx */
start_of_ldx = prog;
+ end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
}
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
@@ -1305,7 +1340,7 @@ st: if (is_imm8(insn->off))
* End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
* of 4 bytes will be ignored and rbx will be zero inited.
*/
- ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+ ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
}
break;
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index b15ebfe40a73..b0b848d6933a 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(data.phys_map, data.size);
+ new = early_memremap_prot(data.phys_map, data.size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 4a3da7592b99..38d24d2ab38b 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -72,6 +72,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -128,8 +129,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
sme_sev_setup_real_mode(trampoline_header);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 220dd9678494..444d824775f6 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>
.pushsection .noinstr.text, "ax"
/*
@@ -193,6 +194,25 @@ SYM_CODE_START(xen_iret)
SYM_CODE_END(xen_iret)
/*
+ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
+ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
+ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
+ * frame at the same address is useless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+ UNWIND_HINT_REGS
+ POP_REGS
+
+ /* stackleak_erase() can work safely on the kernel stack. */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ addq $8, %rsp /* skip regs->orig_ax */
+ jmp xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
+
+/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs