summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/boot/dts/e60k02.dtsi1
-rw-r--r--arch/arm/boot/dts/e70k02.dtsi1
-rw-r--r--arch/arm/boot/dts/imx6sl-tolino-shine2hd.dts1
-rw-r--r--arch/arm/boot/dts/qcom-apq8026-lg-lenok.dts10
-rw-r--r--arch/arm/lib/uaccess_with_memcpy.c4
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-kbox-a-230-ls.dts12
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts2
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var2.dts8
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts2
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts17
-rw-r--r--arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8dxl-evk.dts5
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mm-nitrogen-r2.dts2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mn.dtsi5
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp.dtsi4
-rw-r--r--arch/arm64/boot/dts/freescale/imx93.dtsi24
-rw-r--r--arch/arm64/boot/dts/nvidia/tegra194.dtsi2
-rw-r--r--arch/arm64/boot/dts/nvidia/tegra234.dtsi2
-rw-r--r--arch/arm64/boot/dts/qcom/msm8916-thwc-uf896.dts4
-rw-r--r--arch/arm64/boot/dts/qcom/msm8916-thwc-ufi001c.dts28
-rw-r--r--arch/arm64/boot/dts/qcom/msm8916-ufi.dtsi10
-rw-r--r--arch/arm64/boot/dts/qcom/sa8540p-ride.dts2
-rw-r--r--arch/arm64/boot/dts/qcom/sc7280.dtsi2
-rw-r--r--arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts27
-rw-r--r--arch/arm64/boot/dts/qcom/sc8280xp.dtsi18
-rw-r--r--arch/arm64/boot/dts/qcom/sm6115.dtsi1
-rw-r--r--arch/arm64/boot/dts/qcom/sm6375.dtsi1
-rw-r--r--arch/arm64/boot/dts/qcom/sm8150.dtsi4
-rw-r--r--arch/arm64/boot/dts/qcom/sm8250-xiaomi-elish.dts2
-rw-r--r--arch/arm64/boot/dts/qcom/sm8350.dtsi1
-rw-r--r--arch/arm64/boot/dts/qcom/sm8450.dtsi5
-rw-r--r--arch/arm64/boot/dts/qcom/sm8550.dtsi49
-rw-r--r--arch/arm64/include/asm/kvm_host.h25
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h4
-rw-r--r--arch/arm64/include/asm/sysreg.h3
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h36
-rw-r--r--arch/arm64/kernel/cpufeature.c11
-rw-r--r--arch/arm64/kernel/efi-header.S2
-rw-r--r--arch/arm64/kvm/arch_timer.c550
-rw-r--r--arch/arm64/kvm/arm.c147
-rw-r--r--arch/arm64/kvm/guest.c31
-rw-r--r--arch/arm64/kvm/handle_exit.c36
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h53
-rw-r--r--arch/arm64/kvm/hyp/nvhe/debug-sr.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c7
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c18
-rw-r--r--arch/arm64/kvm/hyp/nvhe/timer-sr.c18
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c38
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c7
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c12
-rw-r--r--arch/arm64/kvm/hypercalls.c189
-rw-r--r--arch/arm64/kvm/pmu-emul.c25
-rw-r--r--arch/arm64/kvm/psci.c37
-rw-r--r--arch/arm64/kvm/reset.c15
-rw-r--r--arch/arm64/kvm/sys_regs.c10
-rw-r--r--arch/arm64/kvm/trace_arm.h6
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c8
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c36
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c33
-rw-r--r--arch/arm64/kvm/vgic/vgic-kvm-device.c85
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c4
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c12
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c11
-rw-r--r--arch/arm64/kvm/vgic/vgic.c27
-rw-r--r--arch/arm64/kvm/vgic/vgic.h3
-rw-r--r--arch/arm64/tools/cpucaps1
-rw-r--r--arch/arm64/tools/sysreg4
-rw-r--r--arch/mips/include/asm/kvm_host.h2
-rw-r--r--arch/mips/kvm/vz.c2
-rw-r--r--arch/powerpc/include/asm/kasan.h2
-rw-r--r--arch/powerpc/include/asm/string.h15
-rw-r--r--arch/powerpc/kernel/prom_init_check.sh9
-rw-r--r--arch/powerpc/mm/fault.c11
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig1
-rw-r--r--arch/riscv/Kconfig22
-rw-r--r--arch/riscv/Makefile10
-rw-r--r--arch/riscv/include/asm/mmu.h2
-rw-r--r--arch/riscv/include/asm/tlbflush.h20
-rw-r--r--arch/riscv/mm/context.c42
-rw-r--r--arch/riscv/mm/fault.c5
-rw-r--r--arch/riscv/mm/tlbflush.c30
-rw-r--r--arch/s390/boot/ipl_report.c8
-rw-r--r--arch/s390/configs/debug_defconfig13
-rw-r--r--arch/s390/configs/defconfig12
-rw-r--r--arch/s390/configs/zfcpdump_defconfig2
-rw-r--r--arch/s390/kvm/interrupt.c4
-rw-r--r--arch/s390/kvm/pci.c2
-rw-r--r--arch/s390/kvm/vsie.c50
-rw-r--r--arch/s390/pci/pci.c16
-rw-r--r--arch/s390/pci/pci_bus.c12
-rw-r--r--arch/s390/pci/pci_bus.h3
-rw-r--r--arch/x86/events/amd/core.c3
-rw-r--r--arch/x86/include/asm/cpufeatures.h9
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h52
-rw-r--r--arch/x86/include/asm/sev-common.h3
-rw-r--r--arch/x86/include/asm/svm.h10
-rw-r--r--arch/x86/include/asm/xen/cpuid.h22
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/kernel/cpu/mce/core.c1
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c7
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c25
-rw-r--r--arch/x86/kernel/fpu/xstate.c30
-rw-r--r--arch/x86/kernel/ftrace_64.S2
-rw-r--r--arch/x86/kernel/sev.c26
-rw-r--r--arch/x86/kvm/cpuid.c16
-rw-r--r--arch/x86/kvm/emulate.c8
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h18
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c33
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h5
-rw-r--r--arch/x86/kvm/mmu.h28
-rw-r--r--arch/x86/kvm/mmu/mmu.c519
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h8
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h214
-rw-r--r--arch/x86/kvm/mmu/spte.c2
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h48
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c215
-rw-r--r--arch/x86/kvm/pmu.c25
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/nested.c91
-rw-r--r--arch/x86/kvm/svm/pmu.c2
-rw-r--r--arch/x86/kvm/svm/svm.c212
-rw-r--r--arch/x86/kvm/svm/svm.h29
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h5
-rw-r--r--arch/x86/kvm/vmx/nested.c11
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c135
-rw-r--r--arch/x86/kvm/vmx/vmx.c82
-rw-r--r--arch/x86/kvm/vmx/vmx.h20
-rw-r--r--arch/x86/kvm/x86.c230
-rw-r--r--arch/x86/kvm/x86.h64
-rw-r--r--arch/x86/mm/cpu_entry_area.c7
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c3
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten_pv.c3
-rw-r--r--arch/x86/xen/enlighten_pvh.c13
-rw-r--r--arch/x86/xen/time.c7
-rw-r--r--arch/x86/xen/vga.c5
-rw-r--r--arch/x86/xen/xen-ops.h7
139 files changed, 2806 insertions, 1522 deletions
diff --git a/arch/arm/boot/dts/e60k02.dtsi b/arch/arm/boot/dts/e60k02.dtsi
index 94944cc21931..dd03e3860f97 100644
--- a/arch/arm/boot/dts/e60k02.dtsi
+++ b/arch/arm/boot/dts/e60k02.dtsi
@@ -311,6 +311,7 @@
&usbotg1 {
pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_usbotg1>;
disable-over-current;
srp-disable;
hnp-disable;
diff --git a/arch/arm/boot/dts/e70k02.dtsi b/arch/arm/boot/dts/e70k02.dtsi
index ace3eb8a97b8..4e1bf080eaca 100644
--- a/arch/arm/boot/dts/e70k02.dtsi
+++ b/arch/arm/boot/dts/e70k02.dtsi
@@ -321,6 +321,7 @@
&usbotg1 {
pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_usbotg1>;
disable-over-current;
srp-disable;
hnp-disable;
diff --git a/arch/arm/boot/dts/imx6sl-tolino-shine2hd.dts b/arch/arm/boot/dts/imx6sl-tolino-shine2hd.dts
index da1399057634..815119c12bd4 100644
--- a/arch/arm/boot/dts/imx6sl-tolino-shine2hd.dts
+++ b/arch/arm/boot/dts/imx6sl-tolino-shine2hd.dts
@@ -625,6 +625,7 @@
&usbotg1 {
pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_usbotg1>;
disable-over-current;
srp-disable;
hnp-disable;
diff --git a/arch/arm/boot/dts/qcom-apq8026-lg-lenok.dts b/arch/arm/boot/dts/qcom-apq8026-lg-lenok.dts
index de2fb1c01b6e..b82381229adf 100644
--- a/arch/arm/boot/dts/qcom-apq8026-lg-lenok.dts
+++ b/arch/arm/boot/dts/qcom-apq8026-lg-lenok.dts
@@ -27,6 +27,16 @@
};
reserved-memory {
+ sbl_region: sbl@2f00000 {
+ reg = <0x02f00000 0x100000>;
+ no-map;
+ };
+
+ external_image_region: external-image@3100000 {
+ reg = <0x03100000 0x200000>;
+ no-map;
+ };
+
adsp_region: adsp@3300000 {
reg = <0x03300000 0x1400000>;
no-map;
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 14eecaaf295f..e4c2677cc1e9 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -116,7 +116,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
tocopy = n;
ua_flags = uaccess_save_and_enable();
- memcpy((void *)to, from, tocopy);
+ __memcpy((void *)to, from, tocopy);
uaccess_restore(ua_flags);
to += tocopy;
from += tocopy;
@@ -178,7 +178,7 @@ __clear_user_memset(void __user *addr, unsigned long n)
tocopy = n;
ua_flags = uaccess_save_and_enable();
- memset((void *)addr, 0, tocopy);
+ __memset((void *)addr, 0, tocopy);
uaccess_restore(ua_flags);
addr += tocopy;
n -= tocopy;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-kbox-a-230-ls.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-kbox-a-230-ls.dts
index af9194eca556..73eb6061c73e 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-kbox-a-230-ls.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-kbox-a-230-ls.dts
@@ -56,14 +56,10 @@
};
&enetc_port2 {
- nvmem-cells = <&base_mac_address 2>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
&enetc_port3 {
- nvmem-cells = <&base_mac_address 3>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
@@ -84,8 +80,6 @@
managed = "in-band-status";
phy-handle = <&qsgmii_phy0>;
phy-mode = "qsgmii";
- nvmem-cells = <&base_mac_address 4>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
@@ -94,8 +88,6 @@
managed = "in-band-status";
phy-handle = <&qsgmii_phy1>;
phy-mode = "qsgmii";
- nvmem-cells = <&base_mac_address 5>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
@@ -104,8 +96,6 @@
managed = "in-band-status";
phy-handle = <&qsgmii_phy2>;
phy-mode = "qsgmii";
- nvmem-cells = <&base_mac_address 6>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
@@ -114,8 +104,6 @@
managed = "in-band-status";
phy-handle = <&qsgmii_phy3>;
phy-mode = "qsgmii";
- nvmem-cells = <&base_mac_address 7>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts
index 1f34c7553459..7cd29ab970d9 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var1.dts
@@ -55,7 +55,5 @@
&enetc_port1 {
phy-handle = <&phy0>;
phy-mode = "rgmii-id";
- nvmem-cells = <&base_mac_address 0>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var2.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var2.dts
index aac41192caa1..113b1df74bf8 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var2.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var2.dts
@@ -36,14 +36,10 @@
};
&enetc_port2 {
- nvmem-cells = <&base_mac_address 2>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
&enetc_port3 {
- nvmem-cells = <&base_mac_address 3>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
@@ -56,8 +52,6 @@
managed = "in-band-status";
phy-handle = <&phy0>;
phy-mode = "sgmii";
- nvmem-cells = <&base_mac_address 0>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
@@ -66,8 +60,6 @@
managed = "in-band-status";
phy-handle = <&phy1>;
phy-mode = "sgmii";
- nvmem-cells = <&base_mac_address 1>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts
index a4421db3784e..9b5e92fb753e 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28-var4.dts
@@ -43,7 +43,5 @@
&enetc_port1 {
phy-handle = <&phy1>;
phy-mode = "rgmii-id";
- nvmem-cells = <&base_mac_address 1>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts
index 8b65af4a7147..4ab17b984b03 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts
@@ -92,8 +92,6 @@
phy-handle = <&phy0>;
phy-mode = "sgmii";
managed = "in-band-status";
- nvmem-cells = <&base_mac_address 0>;
- nvmem-cell-names = "mac-address";
status = "okay";
};
@@ -156,21 +154,6 @@
label = "bootloader environment";
};
};
-
- otp-1 {
- compatible = "user-otp";
-
- nvmem-layout {
- compatible = "kontron,sl28-vpd";
-
- serial_number: serial-number {
- };
-
- base_mac_address: base-mac-address {
- #nvmem-cell-cells = <1>;
- };
- };
- };
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi
index 1f3d225e64ec..06b94bbc2b97 100644
--- a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi
@@ -117,7 +117,7 @@ lsio_subsys: bus@5d000000 {
interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX_SC_R_FSPI_0 IMX_SC_PM_CLK_PER>,
<&clk IMX_SC_R_FSPI_0 IMX_SC_PM_CLK_PER>;
- clock-names = "fspi", "fspi_en";
+ clock-names = "fspi_en", "fspi";
power-domains = <&pd IMX_SC_R_FSPI_0>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8dxl-evk.dts b/arch/arm64/boot/dts/freescale/imx8dxl-evk.dts
index 1bcf228a22b8..852420349c01 100644
--- a/arch/arm64/boot/dts/freescale/imx8dxl-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx8dxl-evk.dts
@@ -121,8 +121,6 @@
phy-handle = <&ethphy0>;
nvmem-cells = <&fec_mac1>;
nvmem-cell-names = "mac-address";
- snps,reset-gpios = <&pca6416_1 2 GPIO_ACTIVE_LOW>;
- snps,reset-delays-us = <10 20 200000>;
status = "okay";
mdio {
@@ -136,6 +134,9 @@
eee-broken-1000t;
qca,disable-smarteee;
qca,disable-hibernation-mode;
+ reset-gpios = <&pca6416_1 2 GPIO_ACTIVE_LOW>;
+ reset-assert-us = <20>;
+ reset-deassert-us = <200000>;
vddio-supply = <&vddio0>;
vddio0: vddio-regulator {
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-nitrogen-r2.dts b/arch/arm64/boot/dts/freescale/imx8mm-nitrogen-r2.dts
index 6357078185ed..0e8f0d7161ad 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-nitrogen-r2.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mm-nitrogen-r2.dts
@@ -247,7 +247,7 @@
compatible = "wlf,wm8960";
reg = <0x1a>;
clocks = <&clk IMX8MM_CLK_SAI1_ROOT>;
- clock-names = "mclk1";
+ clock-names = "mclk";
wlf,shared-lrclk;
#sound-dai-cells = <0>;
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mn.dtsi b/arch/arm64/boot/dts/freescale/imx8mn.dtsi
index ed9ac6c5047c..9e0ddd6b7a32 100644
--- a/arch/arm64/boot/dts/freescale/imx8mn.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mn.dtsi
@@ -296,6 +296,7 @@
sai2: sai@30020000 {
compatible = "fsl,imx8mn-sai", "fsl,imx8mq-sai";
reg = <0x30020000 0x10000>;
+ #sound-dai-cells = <0>;
interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MN_CLK_SAI2_IPG>,
<&clk IMX8MN_CLK_DUMMY>,
@@ -310,6 +311,7 @@
sai3: sai@30030000 {
compatible = "fsl,imx8mn-sai", "fsl,imx8mq-sai";
reg = <0x30030000 0x10000>;
+ #sound-dai-cells = <0>;
interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MN_CLK_SAI3_IPG>,
<&clk IMX8MN_CLK_DUMMY>,
@@ -324,6 +326,7 @@
sai5: sai@30050000 {
compatible = "fsl,imx8mn-sai", "fsl,imx8mq-sai";
reg = <0x30050000 0x10000>;
+ #sound-dai-cells = <0>;
interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MN_CLK_SAI5_IPG>,
<&clk IMX8MN_CLK_DUMMY>,
@@ -340,6 +343,7 @@
sai6: sai@30060000 {
compatible = "fsl,imx8mn-sai", "fsl,imx8mq-sai";
reg = <0x30060000 0x10000>;
+ #sound-dai-cells = <0>;
interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MN_CLK_SAI6_IPG>,
<&clk IMX8MN_CLK_DUMMY>,
@@ -397,6 +401,7 @@
sai7: sai@300b0000 {
compatible = "fsl,imx8mn-sai", "fsl,imx8mq-sai";
reg = <0x300b0000 0x10000>;
+ #sound-dai-cells = <0>;
interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MN_CLK_SAI7_IPG>,
<&clk IMX8MN_CLK_DUMMY>,
diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
index a19224fe1a6a..2dd60e3252f3 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
@@ -1131,8 +1131,8 @@
reg = <0x32e90000 0x238>;
interrupts = <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MP_CLK_MEDIA_DISP2_PIX_ROOT>,
- <&clk IMX8MP_CLK_MEDIA_AXI_ROOT>,
- <&clk IMX8MP_CLK_MEDIA_APB_ROOT>;
+ <&clk IMX8MP_CLK_MEDIA_APB_ROOT>,
+ <&clk IMX8MP_CLK_MEDIA_AXI_ROOT>;
clock-names = "pix", "axi", "disp_axi";
assigned-clocks = <&clk IMX8MP_CLK_MEDIA_DISP2_PIX>,
<&clk IMX8MP_VIDEO_PLL1>;
diff --git a/arch/arm64/boot/dts/freescale/imx93.dtsi b/arch/arm64/boot/dts/freescale/imx93.dtsi
index 2076f9c9983a..41efd97dd6d6 100644
--- a/arch/arm64/boot/dts/freescale/imx93.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx93.dtsi
@@ -164,6 +164,8 @@
lpi2c1: i2c@44340000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x44340000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C1_GATE>,
<&clk IMX93_CLK_BUS_AON>;
@@ -174,6 +176,8 @@
lpi2c2: i2c@44350000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x44350000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C2_GATE>,
<&clk IMX93_CLK_BUS_AON>;
@@ -343,6 +347,8 @@
lpi2c3: i2c@42530000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x42530000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C3_GATE>,
<&clk IMX93_CLK_BUS_WAKEUP>;
@@ -353,6 +359,8 @@
lpi2c4: i2c@42540000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x42540000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C4_GATE>,
<&clk IMX93_CLK_BUS_WAKEUP>;
@@ -455,6 +463,8 @@
lpi2c5: i2c@426b0000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x426b0000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 195 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C5_GATE>,
<&clk IMX93_CLK_BUS_WAKEUP>;
@@ -465,6 +475,8 @@
lpi2c6: i2c@426c0000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x426c0000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 196 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C6_GATE>,
<&clk IMX93_CLK_BUS_WAKEUP>;
@@ -475,6 +487,8 @@
lpi2c7: i2c@426d0000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x426d0000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 197 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C7_GATE>,
<&clk IMX93_CLK_BUS_WAKEUP>;
@@ -485,6 +499,8 @@
lpi2c8: i2c@426e0000 {
compatible = "fsl,imx93-lpi2c", "fsl,imx7ulp-lpi2c";
reg = <0x426e0000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
interrupts = <GIC_SPI 198 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX93_CLK_LPI2C8_GATE>,
<&clk IMX93_CLK_BUS_WAKEUP>;
@@ -580,9 +596,9 @@
eqos: ethernet@428a0000 {
compatible = "nxp,imx93-dwmac-eqos", "snps,dwmac-5.10a";
reg = <0x428a0000 0x10000>;
- interrupts = <GIC_SPI 183 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 184 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "eth_wake_irq", "macirq";
+ interrupts = <GIC_SPI 184 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 183 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "macirq", "eth_wake_irq";
clocks = <&clk IMX93_CLK_ENET_QOS_GATE>,
<&clk IMX93_CLK_ENET_QOS_GATE>,
<&clk IMX93_CLK_ENET_TIMER2>,
@@ -595,7 +611,7 @@
<&clk IMX93_CLK_SYS_PLL_PFD0_DIV2>;
assigned-clock-rates = <100000000>, <250000000>;
intf_mode = <&wakeupmix_gpr 0x28>;
- clk_csr = <0>;
+ snps,clk-csr = <0>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi
index 133dbe5b429d..7096b999b33f 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi
@@ -22,7 +22,7 @@
#address-cells = <2>;
#size-cells = <2>;
- ranges = <0x0 0x0 0x0 0x0 0x0 0x40000000>;
+ ranges = <0x0 0x0 0x0 0x0 0x100 0x0>;
apbmisc: misc@100000 {
compatible = "nvidia,tegra194-misc";
diff --git a/arch/arm64/boot/dts/nvidia/tegra234.dtsi b/arch/arm64/boot/dts/nvidia/tegra234.dtsi
index 8fe8eda7654d..f1748cff8a33 100644
--- a/arch/arm64/boot/dts/nvidia/tegra234.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra234.dtsi
@@ -20,7 +20,7 @@
#address-cells = <2>;
#size-cells = <2>;
- ranges = <0x0 0x0 0x0 0x0 0x0 0x40000000>;
+ ranges = <0x0 0x0 0x0 0x0 0x100 0x0>;
misc@100000 {
compatible = "nvidia,tegra234-misc";
diff --git a/arch/arm64/boot/dts/qcom/msm8916-thwc-uf896.dts b/arch/arm64/boot/dts/qcom/msm8916-thwc-uf896.dts
index c492db856190..82e260375174 100644
--- a/arch/arm64/boot/dts/qcom/msm8916-thwc-uf896.dts
+++ b/arch/arm64/boot/dts/qcom/msm8916-thwc-uf896.dts
@@ -33,7 +33,3 @@
&gpio_leds_default {
pins = "gpio81", "gpio82", "gpio83";
};
-
-&sim_ctrl_default {
- pins = "gpio1", "gpio2";
-};
diff --git a/arch/arm64/boot/dts/qcom/msm8916-thwc-ufi001c.dts b/arch/arm64/boot/dts/qcom/msm8916-thwc-ufi001c.dts
index 700cf81cbf8c..8433c9710b1c 100644
--- a/arch/arm64/boot/dts/qcom/msm8916-thwc-ufi001c.dts
+++ b/arch/arm64/boot/dts/qcom/msm8916-thwc-ufi001c.dts
@@ -25,6 +25,11 @@
gpios = <&msmgpio 20 GPIO_ACTIVE_HIGH>;
};
+&mpss {
+ pinctrl-0 = <&sim_ctrl_default>;
+ pinctrl-names = "default";
+};
+
&button_default {
pins = "gpio37";
bias-pull-down;
@@ -34,6 +39,25 @@
pins = "gpio20", "gpio21", "gpio22";
};
-&sim_ctrl_default {
- pins = "gpio1", "gpio2";
+/* This selects the external SIM card slot by default */
+&msmgpio {
+ sim_ctrl_default: sim-ctrl-default-state {
+ esim-sel-pins {
+ pins = "gpio0", "gpio3";
+ bias-disable;
+ output-low;
+ };
+
+ sim-en-pins {
+ pins = "gpio1";
+ bias-disable;
+ output-low;
+ };
+
+ sim-sel-pins {
+ pins = "gpio2";
+ bias-disable;
+ output-high;
+ };
+ };
};
diff --git a/arch/arm64/boot/dts/qcom/msm8916-ufi.dtsi b/arch/arm64/boot/dts/qcom/msm8916-ufi.dtsi
index 790a9696da9d..cdf34b74fa8f 100644
--- a/arch/arm64/boot/dts/qcom/msm8916-ufi.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8916-ufi.dtsi
@@ -92,9 +92,6 @@
};
&mpss {
- pinctrl-0 = <&sim_ctrl_default>;
- pinctrl-names = "default";
-
status = "okay";
};
@@ -240,11 +237,4 @@
drive-strength = <2>;
bias-disable;
};
-
- sim_ctrl_default: sim-ctrl-default-state {
- function = "gpio";
- drive-strength = <2>;
- bias-disable;
- output-low;
- };
};
diff --git a/arch/arm64/boot/dts/qcom/sa8540p-ride.dts b/arch/arm64/boot/dts/qcom/sa8540p-ride.dts
index 3ccb5ffdb3ca..24fa449d48a6 100644
--- a/arch/arm64/boot/dts/qcom/sa8540p-ride.dts
+++ b/arch/arm64/boot/dts/qcom/sa8540p-ride.dts
@@ -241,7 +241,7 @@
};
&remoteproc_nsp0 {
- firmware-name = "qcom/sa8540p/cdsp.mbn";
+ firmware-name = "qcom/sa8540p/cdsp0.mbn";
status = "okay";
};
diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi
index bdcb74925313..8f4ab6bd2886 100644
--- a/arch/arm64/boot/dts/qcom/sc7280.dtsi
+++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi
@@ -2131,6 +2131,8 @@
pinctrl-names = "default";
pinctrl-0 = <&pcie1_clkreq_n>;
+ dma-coherent;
+
iommus = <&apps_smmu 0x1c80 0x1>;
iommu-map = <0x0 &apps_smmu 0x1c80 0x1>,
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
index 98e71b933437..99c6d6574559 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
+++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
@@ -370,6 +370,7 @@
regulator-min-microvolt = <1800000>;
regulator-max-microvolt = <1800000>;
regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+ regulator-always-on;
};
vreg_s11b: smps11 {
@@ -377,6 +378,7 @@
regulator-min-microvolt = <1272000>;
regulator-max-microvolt = <1272000>;
regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+ regulator-always-on;
};
vreg_s12b: smps12 {
@@ -384,6 +386,7 @@
regulator-min-microvolt = <984000>;
regulator-max-microvolt = <984000>;
regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+ regulator-always-on;
};
vreg_l3b: ldo3 {
@@ -441,6 +444,7 @@
regulator-min-microvolt = <3008000>;
regulator-max-microvolt = <3960000>;
regulator-initial-mode = <RPMH_REGULATOR_MODE_AUTO>;
+ regulator-always-on;
};
};
@@ -772,75 +776,88 @@
pmic-die-temp@3 {
reg = <PMK8350_ADC7_DIE_TEMP>;
qcom,pre-scaling = <1 1>;
+ label = "pmk8350_die_temp";
};
xo-therm@44 {
reg = <PMK8350_ADC7_AMUX_THM1_100K_PU>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "pmk8350_xo_therm";
};
pmic-die-temp@103 {
reg = <PM8350_ADC7_DIE_TEMP(1)>;
qcom,pre-scaling = <1 1>;
+ label = "pmc8280_1_die_temp";
};
sys-therm@144 {
reg = <PM8350_ADC7_AMUX_THM1_100K_PU(1)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm1";
};
sys-therm@145 {
reg = <PM8350_ADC7_AMUX_THM2_100K_PU(1)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm2";
};
sys-therm@146 {
reg = <PM8350_ADC7_AMUX_THM3_100K_PU(1)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm3";
};
sys-therm@147 {
reg = <PM8350_ADC7_AMUX_THM4_100K_PU(1)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm4";
};
pmic-die-temp@303 {
reg = <PM8350_ADC7_DIE_TEMP(3)>;
qcom,pre-scaling = <1 1>;
+ label = "pmc8280_2_die_temp";
};
sys-therm@344 {
reg = <PM8350_ADC7_AMUX_THM1_100K_PU(3)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm5";
};
sys-therm@345 {
reg = <PM8350_ADC7_AMUX_THM2_100K_PU(3)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm6";
};
sys-therm@346 {
reg = <PM8350_ADC7_AMUX_THM3_100K_PU(3)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm7";
};
sys-therm@347 {
reg = <PM8350_ADC7_AMUX_THM4_100K_PU(3)>;
qcom,hw-settle-time = <200>;
qcom,ratiometric;
+ label = "sys_therm8";
};
pmic-die-temp@403 {
reg = <PMR735A_ADC7_DIE_TEMP>;
qcom,pre-scaling = <1 1>;
+ label = "pmr735a_die_temp";
};
};
@@ -884,9 +901,9 @@
"VA DMIC0", "MIC BIAS1",
"VA DMIC1", "MIC BIAS1",
"VA DMIC2", "MIC BIAS3",
- "TX DMIC0", "MIC BIAS1",
- "TX DMIC1", "MIC BIAS2",
- "TX DMIC2", "MIC BIAS3",
+ "VA DMIC0", "VA MIC BIAS1",
+ "VA DMIC1", "VA MIC BIAS1",
+ "VA DMIC2", "VA MIC BIAS3",
"TX SWR_ADC1", "ADC2_OUTPUT";
wcd-playback-dai-link {
@@ -937,7 +954,7 @@
va-dai-link {
link-name = "VA Capture";
cpu {
- sound-dai = <&q6apmbedai TX_CODEC_DMA_TX_3>;
+ sound-dai = <&q6apmbedai VA_CODEC_DMA_TX_0>;
};
platform {
@@ -1062,7 +1079,7 @@
vdd-micb-supply = <&vreg_s10b>;
- qcom,dmic-sample-rate = <600000>;
+ qcom,dmic-sample-rate = <4800000>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp.dtsi b/arch/arm64/boot/dts/qcom/sc8280xp.dtsi
index 0d02599d8867..42bfa9fa5b96 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp.dtsi
+++ b/arch/arm64/boot/dts/qcom/sc8280xp.dtsi
@@ -2504,12 +2504,12 @@
qcom,ports-sinterval-low = /bits/ 8 <0x03 0x1f 0x1f 0x07 0x00>;
qcom,ports-offset1 = /bits/ 8 <0x00 0x00 0x0B 0x01 0x00>;
qcom,ports-offset2 = /bits/ 8 <0x00 0x00 0x0B 0x00 0x00>;
- qcom,ports-hstart = /bits/ 8 <0xff 0x03 0xff 0xff 0xff>;
- qcom,ports-hstop = /bits/ 8 <0xff 0x06 0xff 0xff 0xff>;
+ qcom,ports-hstart = /bits/ 8 <0xff 0x03 0x00 0xff 0xff>;
+ qcom,ports-hstop = /bits/ 8 <0xff 0x06 0x0f 0xff 0xff>;
qcom,ports-word-length = /bits/ 8 <0x01 0x07 0x04 0xff 0xff>;
- qcom,ports-block-pack-mode = /bits/ 8 <0xff 0x00 0x01 0xff 0xff>;
+ qcom,ports-block-pack-mode = /bits/ 8 <0xff 0xff 0x01 0xff 0xff>;
qcom,ports-lane-control = /bits/ 8 <0x01 0x00 0x00 0x00 0x00>;
- qcom,ports-block-group-count = /bits/ 8 <0xff 0xff 0xff 0xff 0x00>;
+ qcom,ports-block-group-count = /bits/ 8 <0xff 0xff 0xff 0xff 0xff>;
#sound-dai-cells = <1>;
#address-cells = <2>;
@@ -2600,7 +2600,7 @@
<&intc GIC_SPI 520 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "core", "wake";
- clocks = <&vamacro>;
+ clocks = <&txmacro>;
clock-names = "iface";
label = "TX";
#sound-dai-cells = <1>;
@@ -2609,15 +2609,15 @@
qcom,din-ports = <4>;
qcom,dout-ports = <0>;
- qcom,ports-sinterval-low = /bits/ 8 <0x01 0x03 0x03 0x03>;
- qcom,ports-offset1 = /bits/ 8 <0x01 0x00 0x02 0x01>;
+ qcom,ports-sinterval-low = /bits/ 8 <0x01 0x01 0x03 0x03>;
+ qcom,ports-offset1 = /bits/ 8 <0x01 0x00 0x02 0x00>;
qcom,ports-offset2 = /bits/ 8 <0x00 0x00 0x00 0x00>;
qcom,ports-block-pack-mode = /bits/ 8 <0xff 0xff 0xff 0xff>;
qcom,ports-hstart = /bits/ 8 <0xff 0xff 0xff 0xff>;
qcom,ports-hstop = /bits/ 8 <0xff 0xff 0xff 0xff>;
- qcom,ports-word-length = /bits/ 8 <0xff 0x00 0xff 0xff>;
+ qcom,ports-word-length = /bits/ 8 <0xff 0xff 0xff 0xff>;
qcom,ports-block-group-count = /bits/ 8 <0xff 0xff 0xff 0xff>;
- qcom,ports-lane-control = /bits/ 8 <0x00 0x01 0x00 0x00>;
+ qcom,ports-lane-control = /bits/ 8 <0x00 0x01 0x00 0x01>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/qcom/sm6115.dtsi b/arch/arm64/boot/dts/qcom/sm6115.dtsi
index 4d6ec815b78b..fbd67d2c8d78 100644
--- a/arch/arm64/boot/dts/qcom/sm6115.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm6115.dtsi
@@ -1078,6 +1078,7 @@
dma-names = "tx", "rx";
#address-cells = <1>;
#size-cells = <0>;
+ status = "disabled";
};
};
diff --git a/arch/arm64/boot/dts/qcom/sm6375.dtsi b/arch/arm64/boot/dts/qcom/sm6375.dtsi
index 31b88c738510..068ee4f72485 100644
--- a/arch/arm64/boot/dts/qcom/sm6375.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm6375.dtsi
@@ -1209,6 +1209,7 @@
clock-names = "xo";
power-domains = <&rpmpd SM6375_VDDCX>;
+ power-domain-names = "cx";
memory-region = <&pil_cdsp_mem>;
diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index fd20096cfc6e..13e0ce828606 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -1826,7 +1826,7 @@
"slave_q2a",
"tbu";
- iommus = <&apps_smmu 0x1d80 0x7f>;
+ iommus = <&apps_smmu 0x1d80 0x3f>;
iommu-map = <0x0 &apps_smmu 0x1d80 0x1>,
<0x100 &apps_smmu 0x1d81 0x1>;
@@ -1925,7 +1925,7 @@
assigned-clocks = <&gcc GCC_PCIE_1_AUX_CLK>;
assigned-clock-rates = <19200000>;
- iommus = <&apps_smmu 0x1e00 0x7f>;
+ iommus = <&apps_smmu 0x1e00 0x3f>;
iommu-map = <0x0 &apps_smmu 0x1e00 0x1>,
<0x100 &apps_smmu 0x1e01 0x1>;
diff --git a/arch/arm64/boot/dts/qcom/sm8250-xiaomi-elish.dts b/arch/arm64/boot/dts/qcom/sm8250-xiaomi-elish.dts
index acaa99c5ff8b..a85d47f7a9e8 100644
--- a/arch/arm64/boot/dts/qcom/sm8250-xiaomi-elish.dts
+++ b/arch/arm64/boot/dts/qcom/sm8250-xiaomi-elish.dts
@@ -625,6 +625,6 @@
};
&venus {
- firmware-name = "qcom/sm8250/elish/venus.mbn";
+ firmware-name = "qcom/sm8250/xiaomi/elish/venus.mbn";
status = "okay";
};
diff --git a/arch/arm64/boot/dts/qcom/sm8350.dtsi b/arch/arm64/boot/dts/qcom/sm8350.dtsi
index 1c97e28da6ad..1a5a612d4234 100644
--- a/arch/arm64/boot/dts/qcom/sm8350.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8350.dtsi
@@ -1664,6 +1664,7 @@
power-domains = <&gcc UFS_PHY_GDSC>;
iommus = <&apps_smmu 0xe0 0x0>;
+ dma-coherent;
clock-names =
"core_clk",
diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi
index 1a744a33bcf4..b285b1530c10 100644
--- a/arch/arm64/boot/dts/qcom/sm8450.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi
@@ -2143,8 +2143,8 @@
<&q6prmcc LPASS_HW_DCODEC_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
<&vamacro>;
clock-names = "mclk", "npl", "macro", "dcodec", "fsgen";
- assigned-clocks = <&q6prmcc LPASS_CLK_ID_WSA_CORE_TX_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
- <&q6prmcc LPASS_CLK_ID_WSA_CORE_TX_2X_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>;
+ assigned-clocks = <&q6prmcc LPASS_CLK_ID_WSA2_CORE_TX_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
+ <&q6prmcc LPASS_CLK_ID_WSA2_CORE_TX_2X_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>;
assigned-clock-rates = <19200000>, <19200000>;
#clock-cells = <0>;
@@ -4003,6 +4003,7 @@
power-domains = <&gcc UFS_PHY_GDSC>;
iommus = <&apps_smmu 0xe0 0x0>;
+ dma-coherent;
interconnects = <&aggre1_noc MASTER_UFS_MEM 0 &mc_virt SLAVE_EBI1 0>,
<&gem_noc MASTER_APPSS_PROC 0 &config_noc SLAVE_UFS_MEM_CFG 0>;
diff --git a/arch/arm64/boot/dts/qcom/sm8550.dtsi b/arch/arm64/boot/dts/qcom/sm8550.dtsi
index ff4d342c0725..5d0888398b3c 100644
--- a/arch/arm64/boot/dts/qcom/sm8550.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8550.dtsi
@@ -66,7 +66,7 @@
CPU0: cpu@0 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-a510";
reg = <0 0>;
enable-method = "psci";
next-level-cache = <&L2_0>;
@@ -89,7 +89,7 @@
CPU1: cpu@100 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-a510";
reg = <0 0x100>;
enable-method = "psci";
next-level-cache = <&L2_100>;
@@ -108,7 +108,7 @@
CPU2: cpu@200 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-a510";
reg = <0 0x200>;
enable-method = "psci";
next-level-cache = <&L2_200>;
@@ -127,7 +127,7 @@
CPU3: cpu@300 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-a715";
reg = <0 0x300>;
enable-method = "psci";
next-level-cache = <&L2_300>;
@@ -146,7 +146,7 @@
CPU4: cpu@400 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-a715";
reg = <0 0x400>;
enable-method = "psci";
next-level-cache = <&L2_400>;
@@ -165,7 +165,7 @@
CPU5: cpu@500 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-a710";
reg = <0 0x500>;
enable-method = "psci";
next-level-cache = <&L2_500>;
@@ -184,7 +184,7 @@
CPU6: cpu@600 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-a710";
reg = <0 0x600>;
enable-method = "psci";
next-level-cache = <&L2_600>;
@@ -203,7 +203,7 @@
CPU7: cpu@700 {
device_type = "cpu";
- compatible = "qcom,kryo";
+ compatible = "arm,cortex-x3";
reg = <0 0x700>;
enable-method = "psci";
next-level-cache = <&L2_700>;
@@ -1905,6 +1905,7 @@
required-opps = <&rpmhpd_opp_nom>;
iommus = <&apps_smmu 0x60 0x0>;
+ dma-coherent;
interconnects = <&aggre1_noc MASTER_UFS_MEM 0 &mc_virt SLAVE_EBI1 0>,
<&gem_noc MASTER_APPSS_PROC 0 &config_noc SLAVE_UFS_MEM_CFG 0>;
@@ -1997,7 +1998,7 @@
lpass_tlmm: pinctrl@6e80000 {
compatible = "qcom,sm8550-lpass-lpi-pinctrl";
reg = <0 0x06e80000 0 0x20000>,
- <0 0x0725a000 0 0x10000>;
+ <0 0x07250000 0 0x10000>;
gpio-controller;
#gpio-cells = <2>;
gpio-ranges = <&lpass_tlmm 0 0 23>;
@@ -2691,7 +2692,7 @@
pins = "gpio28", "gpio29";
function = "qup1_se0";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c1_data_clk: qup-i2c1-data-clk-state {
@@ -2699,7 +2700,7 @@
pins = "gpio32", "gpio33";
function = "qup1_se1";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c2_data_clk: qup-i2c2-data-clk-state {
@@ -2707,7 +2708,7 @@
pins = "gpio36", "gpio37";
function = "qup1_se2";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c3_data_clk: qup-i2c3-data-clk-state {
@@ -2715,7 +2716,7 @@
pins = "gpio40", "gpio41";
function = "qup1_se3";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c4_data_clk: qup-i2c4-data-clk-state {
@@ -2723,7 +2724,7 @@
pins = "gpio44", "gpio45";
function = "qup1_se4";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c5_data_clk: qup-i2c5-data-clk-state {
@@ -2731,7 +2732,7 @@
pins = "gpio52", "gpio53";
function = "qup1_se5";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c6_data_clk: qup-i2c6-data-clk-state {
@@ -2739,7 +2740,7 @@
pins = "gpio48", "gpio49";
function = "qup1_se6";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c8_data_clk: qup-i2c8-data-clk-state {
@@ -2747,14 +2748,14 @@
pins = "gpio57";
function = "qup2_se0_l1_mira";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
sda-pins {
pins = "gpio56";
function = "qup2_se0_l0_mira";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
};
@@ -2763,7 +2764,7 @@
pins = "gpio60", "gpio61";
function = "qup2_se1";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c10_data_clk: qup-i2c10-data-clk-state {
@@ -2771,7 +2772,7 @@
pins = "gpio64", "gpio65";
function = "qup2_se2";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c11_data_clk: qup-i2c11-data-clk-state {
@@ -2779,7 +2780,7 @@
pins = "gpio68", "gpio69";
function = "qup2_se3";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c12_data_clk: qup-i2c12-data-clk-state {
@@ -2787,7 +2788,7 @@
pins = "gpio2", "gpio3";
function = "qup2_se4";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c13_data_clk: qup-i2c13-data-clk-state {
@@ -2795,7 +2796,7 @@
pins = "gpio80", "gpio81";
function = "qup2_se5";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_i2c15_data_clk: qup-i2c15-data-clk-state {
@@ -2803,7 +2804,7 @@
pins = "gpio72", "gpio106";
function = "qup2_se7";
drive-strength = <2>;
- bias-pull-up;
+ bias-pull-up = <2200>;
};
qup_spi0_cs: qup-spi0-cs-state {
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 6f7b218a681f..b9e36611734f 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kvm_types.h>
+#include <linux/maple_tree.h>
#include <linux/percpu.h>
#include <linux/psci.h>
#include <asm/arch_gicv3.h>
@@ -199,6 +200,9 @@ struct kvm_arch {
/* Mandated version of PSCI */
u32 psci_version;
+ /* Protects VM-scoped configuration data */
+ struct mutex config_lock;
+
/*
* If we encounter a data abort without valid instruction syndrome
* information, report this to user space. User space can (and
@@ -221,7 +225,12 @@ struct kvm_arch {
#define KVM_ARCH_FLAG_EL1_32BIT 4
/* PSCI SYSTEM_SUSPEND enabled for the guest */
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5
-
+ /* VM counter offset */
+#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6
+ /* Timer PPIs made immutable */
+#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 7
+ /* SMCCC filter initialized for the VM */
+#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 8
unsigned long flags;
/*
@@ -242,6 +251,7 @@ struct kvm_arch {
/* Hypercall features firmware registers' descriptor */
struct kvm_smccc_features smccc_feat;
+ struct maple_tree smccc_filter;
/*
* For an untrusted host VM, 'pkvm.handle' is used to lookup
@@ -365,6 +375,10 @@ enum vcpu_sysreg {
TPIDR_EL2, /* EL2 Software Thread ID Register */
CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */
SP_EL2, /* EL2 Stack Pointer */
+ CNTHP_CTL_EL2,
+ CNTHP_CVAL_EL2,
+ CNTHV_CTL_EL2,
+ CNTHV_CVAL_EL2,
NR_SYS_REGS /* Nothing after this line! */
};
@@ -522,6 +536,7 @@ struct kvm_vcpu_arch {
/* vcpu power state */
struct kvm_mp_state mp_state;
+ spinlock_t mp_state_lock;
/* Cache some mmu pages needed inside spinlock regions */
struct kvm_mmu_memory_cache mmu_page_cache;
@@ -922,6 +937,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
int __init kvm_sys_reg_table_init(void);
+bool lock_all_vcpus(struct kvm *kvm);
+void unlock_all_vcpus(struct kvm *kvm);
+
/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
@@ -1007,6 +1025,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
struct kvm_arm_copy_mte_tags *copy_tags);
+int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
+ struct kvm_arm_counter_offset *offset);
/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@ -1061,6 +1081,9 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
(system_supports_32bit_el0() && \
!static_branch_unlikely(&arm64_mismatched_32bit_el0))
+#define kvm_vm_has_ran_once(kvm) \
+ (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags))
+
int kvm_trng_call(struct kvm_vcpu *vcpu);
#ifdef CONFIG_KVM
extern phys_addr_t hyp_mem_base;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 083cc47dca08..27e63c111f78 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -63,6 +63,7 @@
* specific registers encoded in the instructions).
*/
.macro kern_hyp_va reg
+#ifndef __KVM_VHE_HYPERVISOR__
alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
and \reg, \reg, #1 /* mask with va_mask */
ror \reg, \reg, #1 /* rotate to the first tag bit */
@@ -70,6 +71,7 @@ alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
add \reg, \reg, #0, lsl 12 /* insert the top 12 bits of the tag */
ror \reg, \reg, #63 /* rotate back */
alternative_cb_end
+#endif
.endm
/*
@@ -127,6 +129,7 @@ void kvm_apply_hyp_relocations(void);
static __always_inline unsigned long __kern_hyp_va(unsigned long v)
{
+#ifndef __KVM_VHE_HYPERVISOR__
asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
"ror %0, %0, #1\n"
"add %0, %0, #0\n"
@@ -135,6 +138,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
ARM64_ALWAYS_SYSTEM,
kvm_update_va_mask)
: "+r" (v));
+#endif
return v;
}
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 9e3ecba3c4e6..a43f21559c3e 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -388,6 +388,7 @@
#define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0)
+#define SYS_CNTPCT_EL0 sys_reg(3, 3, 14, 0, 1)
#define SYS_CNTPCTSS_EL0 sys_reg(3, 3, 14, 0, 5)
#define SYS_CNTVCTSS_EL0 sys_reg(3, 3, 14, 0, 6)
@@ -400,7 +401,9 @@
#define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0)
#define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1)
+#define SYS_AARCH32_CNTPCT sys_reg(0, 0, 0, 14, 0)
#define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0)
+#define SYS_AARCH32_CNTPCTSS sys_reg(0, 8, 0, 14, 0)
#define __PMEV_op2(n) ((n) & 0x7)
#define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3))
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f8129c624b07..f7ddd73a8c0f 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags {
__u64 reserved[2];
};
+/*
+ * Counter/Timer offset structure. Describe the virtual/physical offset.
+ * To be used with KVM_ARM_SET_COUNTER_OFFSET.
+ */
+struct kvm_arm_counter_offset {
+ __u64 counter_offset;
+ __u64 reserved;
+};
+
#define KVM_ARM_TAGS_TO_GUEST 0
#define KVM_ARM_TAGS_FROM_GUEST 1
@@ -372,6 +381,10 @@ enum {
#endif
};
+/* Device Control API on vm fd */
+#define KVM_ARM_VM_SMCCC_CTRL 0
+#define KVM_ARM_VM_SMCCC_FILTER 0
+
/* Device Control API: ARM VGIC */
#define KVM_DEV_ARM_VGIC_GRP_ADDR 0
#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1
@@ -411,6 +424,8 @@ enum {
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
+#define KVM_ARM_VCPU_TIMER_IRQ_HVTIMER 2
+#define KVM_ARM_VCPU_TIMER_IRQ_HPTIMER 3
#define KVM_ARM_VCPU_PVTIME_CTRL 2
#define KVM_ARM_VCPU_PVTIME_IPA 0
@@ -469,6 +484,27 @@ enum {
/* run->fail_entry.hardware_entry_failure_reason codes. */
#define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0)
+enum kvm_smccc_filter_action {
+ KVM_SMCCC_FILTER_HANDLE = 0,
+ KVM_SMCCC_FILTER_DENY,
+ KVM_SMCCC_FILTER_FWD_TO_USER,
+
+#ifdef __KERNEL__
+ NR_SMCCC_FILTER_ACTIONS
+#endif
+};
+
+struct kvm_smccc_filter {
+ __u32 base;
+ __u32 nr_functions;
+ __u8 action;
+ __u8 pad[15];
+};
+
+/* arm64-specific KVM_EXIT_HYPERCALL flags */
+#define KVM_HYPERCALL_EXIT_SMC (1U << 0)
+#define KVM_HYPERCALL_EXIT_16BIT (1U << 1)
+
#endif
#endif /* __ARM_KVM_H__ */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 2e3e55139777..c331c49a7d19 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2223,6 +2223,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.sign = FTR_UNSIGNED,
.min_field_value = 1,
},
+ {
+ .desc = "Enhanced Counter Virtualization (CNTPOFF)",
+ .capability = ARM64_HAS_ECV_CNTPOFF,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64MMFR0_EL1,
+ .field_pos = ID_AA64MMFR0_EL1_ECV_SHIFT,
+ .field_width = 4,
+ .sign = FTR_UNSIGNED,
+ .min_field_value = ID_AA64MMFR0_EL1_ECV_CNTPOFF,
+ },
#ifdef CONFIG_ARM64_PAN
{
.desc = "Privileged Access Never",
diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S
index 28d8a5dca5f1..d731b4655df8 100644
--- a/arch/arm64/kernel/efi-header.S
+++ b/arch/arm64/kernel/efi-header.S
@@ -66,7 +66,7 @@
.long .Lefi_header_end - .L_head // SizeOfHeaders
.long 0 // CheckSum
.short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem
- .short 0 // DllCharacteristics
+ .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics
.quad 0 // SizeOfStackReserve
.quad 0 // SizeOfStackCommit
.quad 0 // SizeOfHeapReserve
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index e1af4301b913..05b022be885b 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -16,6 +16,7 @@
#include <asm/arch_timer.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_nested.h>
#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>
@@ -30,14 +31,11 @@ static u32 host_ptimer_irq_flags;
static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
-static const struct kvm_irq_level default_ptimer_irq = {
- .irq = 30,
- .level = 1,
-};
-
-static const struct kvm_irq_level default_vtimer_irq = {
- .irq = 27,
- .level = 1,
+static const u8 default_ppi[] = {
+ [TIMER_PTIMER] = 30,
+ [TIMER_VTIMER] = 27,
+ [TIMER_HPTIMER] = 26,
+ [TIMER_HVTIMER] = 28,
};
static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
@@ -51,6 +49,24 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
struct arch_timer_context *timer,
enum kvm_arch_timer_regs treg);
+static bool kvm_arch_timer_get_input_level(int vintid);
+
+static struct irq_ops arch_timer_irq_ops = {
+ .get_input_level = kvm_arch_timer_get_input_level,
+};
+
+static bool has_cntpoff(void)
+{
+ return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF));
+}
+
+static int nr_timers(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_has_nv(vcpu))
+ return NR_KVM_EL0_TIMERS;
+
+ return NR_KVM_TIMERS;
+}
u32 timer_get_ctl(struct arch_timer_context *ctxt)
{
@@ -61,6 +77,10 @@ u32 timer_get_ctl(struct arch_timer_context *ctxt)
return __vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
case TIMER_PTIMER:
return __vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
+ case TIMER_HVTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2);
+ case TIMER_HPTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2);
default:
WARN_ON(1);
return 0;
@@ -76,6 +96,10 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
return __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
case TIMER_PTIMER:
return __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
+ case TIMER_HVTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2);
+ case TIMER_HPTIMER:
+ return __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2);
default:
WARN_ON(1);
return 0;
@@ -84,10 +108,17 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
static u64 timer_get_offset(struct arch_timer_context *ctxt)
{
+ u64 offset = 0;
+
+ if (!ctxt)
+ return 0;
+
if (ctxt->offset.vm_offset)
- return *ctxt->offset.vm_offset;
+ offset += *ctxt->offset.vm_offset;
+ if (ctxt->offset.vcpu_offset)
+ offset += *ctxt->offset.vcpu_offset;
- return 0;
+ return offset;
}
static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
@@ -101,6 +132,12 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
case TIMER_PTIMER:
__vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl;
break;
+ case TIMER_HVTIMER:
+ __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2) = ctl;
+ break;
+ case TIMER_HPTIMER:
+ __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2) = ctl;
+ break;
default:
WARN_ON(1);
}
@@ -117,6 +154,12 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval)
case TIMER_PTIMER:
__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval;
break;
+ case TIMER_HVTIMER:
+ __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2) = cval;
+ break;
+ case TIMER_HPTIMER:
+ __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = cval;
+ break;
default:
WARN_ON(1);
}
@@ -139,13 +182,27 @@ u64 kvm_phys_timer_read(void)
static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
{
- if (has_vhe()) {
+ if (vcpu_has_nv(vcpu)) {
+ if (is_hyp_ctxt(vcpu)) {
+ map->direct_vtimer = vcpu_hvtimer(vcpu);
+ map->direct_ptimer = vcpu_hptimer(vcpu);
+ map->emul_vtimer = vcpu_vtimer(vcpu);
+ map->emul_ptimer = vcpu_ptimer(vcpu);
+ } else {
+ map->direct_vtimer = vcpu_vtimer(vcpu);
+ map->direct_ptimer = vcpu_ptimer(vcpu);
+ map->emul_vtimer = vcpu_hvtimer(vcpu);
+ map->emul_ptimer = vcpu_hptimer(vcpu);
+ }
+ } else if (has_vhe()) {
map->direct_vtimer = vcpu_vtimer(vcpu);
map->direct_ptimer = vcpu_ptimer(vcpu);
+ map->emul_vtimer = NULL;
map->emul_ptimer = NULL;
} else {
map->direct_vtimer = vcpu_vtimer(vcpu);
map->direct_ptimer = NULL;
+ map->emul_vtimer = NULL;
map->emul_ptimer = vcpu_ptimer(vcpu);
}
@@ -212,7 +269,7 @@ static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx,
ns = cyclecounter_cyc2ns(timecounter->cc,
val - now,
timecounter->mask,
- &timecounter->frac);
+ &timer_ctx->ns_frac);
return ns;
}
@@ -240,8 +297,11 @@ static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu)
static u64 wfit_delay_ns(struct kvm_vcpu *vcpu)
{
- struct arch_timer_context *ctx = vcpu_vtimer(vcpu);
u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+ struct arch_timer_context *ctx;
+
+ ctx = (vcpu_has_nv(vcpu) && is_hyp_ctxt(vcpu)) ? vcpu_hvtimer(vcpu)
+ : vcpu_vtimer(vcpu);
return kvm_counter_compute_delta(ctx, val);
}
@@ -255,7 +315,7 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
u64 min_delta = ULLONG_MAX;
int i;
- for (i = 0; i < NR_KVM_TIMERS; i++) {
+ for (i = 0; i < nr_timers(vcpu); i++) {
struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
WARN(ctx->loaded, "timer %d loaded\n", i);
@@ -338,9 +398,11 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
switch (index) {
case TIMER_VTIMER:
+ case TIMER_HVTIMER:
cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
break;
case TIMER_PTIMER:
+ case TIMER_HPTIMER:
cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
break;
case NR_KVM_TIMERS:
@@ -392,12 +454,12 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
int ret;
timer_ctx->irq.level = new_level;
- trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
+ trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx),
timer_ctx->irq.level);
if (!userspace_irqchip(vcpu->kvm)) {
ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
- timer_ctx->irq.irq,
+ timer_irq(timer_ctx),
timer_ctx->irq.level,
timer_ctx);
WARN_ON(ret);
@@ -432,6 +494,12 @@ static void set_cntvoff(u64 cntvoff)
kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
}
+static void set_cntpoff(u64 cntpoff)
+{
+ if (has_cntpoff())
+ write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2);
+}
+
static void timer_save_state(struct arch_timer_context *ctx)
{
struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
@@ -447,7 +515,10 @@ static void timer_save_state(struct arch_timer_context *ctx)
goto out;
switch (index) {
+ u64 cval;
+
case TIMER_VTIMER:
+ case TIMER_HVTIMER:
timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL));
timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL));
@@ -473,13 +544,20 @@ static void timer_save_state(struct arch_timer_context *ctx)
set_cntvoff(0);
break;
case TIMER_PTIMER:
+ case TIMER_HPTIMER:
timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL));
- timer_set_cval(ctx, read_sysreg_el0(SYS_CNTP_CVAL));
+ cval = read_sysreg_el0(SYS_CNTP_CVAL);
+
+ if (!has_cntpoff())
+ cval -= timer_get_offset(ctx);
+
+ timer_set_cval(ctx, cval);
/* Disable the timer */
write_sysreg_el0(0, SYS_CNTP_CTL);
isb();
+ set_cntpoff(0);
break;
case NR_KVM_TIMERS:
BUG();
@@ -510,6 +588,7 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
*/
if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
!kvm_timer_irq_can_fire(map.direct_ptimer) &&
+ !kvm_timer_irq_can_fire(map.emul_vtimer) &&
!kvm_timer_irq_can_fire(map.emul_ptimer) &&
!vcpu_has_wfit_active(vcpu))
return;
@@ -543,14 +622,23 @@ static void timer_restore_state(struct arch_timer_context *ctx)
goto out;
switch (index) {
+ u64 cval, offset;
+
case TIMER_VTIMER:
+ case TIMER_HVTIMER:
set_cntvoff(timer_get_offset(ctx));
write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL);
isb();
write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
break;
case TIMER_PTIMER:
- write_sysreg_el0(timer_get_cval(ctx), SYS_CNTP_CVAL);
+ case TIMER_HPTIMER:
+ cval = timer_get_cval(ctx);
+ offset = timer_get_offset(ctx);
+ set_cntpoff(offset);
+ if (!has_cntpoff())
+ cval += offset;
+ write_sysreg_el0(cval, SYS_CNTP_CVAL);
isb();
write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL);
break;
@@ -586,7 +674,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
if (irqchip_in_kernel(vcpu->kvm))
- phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
+ phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx));
phys_active |= ctx->irq.level;
@@ -621,6 +709,128 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
+/* If _pred is true, set bit in _set, otherwise set it in _clr */
+#define assign_clear_set_bit(_pred, _bit, _clr, _set) \
+ do { \
+ if (_pred) \
+ (_set) |= (_bit); \
+ else \
+ (_clr) |= (_bit); \
+ } while (0)
+
+static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
+ struct timer_map *map)
+{
+ int hw, ret;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ /*
+ * We only ever unmap the vtimer irq on a VHE system that runs nested
+ * virtualization, in which case we have both a valid emul_vtimer,
+ * emul_ptimer, direct_vtimer, and direct_ptimer.
+ *
+ * Since this is called from kvm_timer_vcpu_load(), a change between
+ * vEL2 and vEL1/0 will have just happened, and the timer_map will
+ * represent this, and therefore we switch the emul/direct mappings
+ * below.
+ */
+ hw = kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer));
+ if (hw < 0) {
+ kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer));
+ kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer));
+
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map->direct_vtimer->host_timer_irq,
+ timer_irq(map->direct_vtimer),
+ &arch_timer_irq_ops);
+ WARN_ON_ONCE(ret);
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map->direct_ptimer->host_timer_irq,
+ timer_irq(map->direct_ptimer),
+ &arch_timer_irq_ops);
+ WARN_ON_ONCE(ret);
+
+ /*
+ * The virtual offset behaviour is "interresting", as it
+ * always applies when HCR_EL2.E2H==0, but only when
+ * accessed from EL1 when HCR_EL2.E2H==1. So make sure we
+ * track E2H when putting the HV timer in "direct" mode.
+ */
+ if (map->direct_vtimer == vcpu_hvtimer(vcpu)) {
+ struct arch_timer_offset *offs = &map->direct_vtimer->offset;
+
+ if (vcpu_el2_e2h_is_set(vcpu))
+ offs->vcpu_offset = NULL;
+ else
+ offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
+ }
+ }
+}
+
+static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
+{
+ bool tpt, tpc;
+ u64 clr, set;
+
+ /*
+ * No trapping gets configured here with nVHE. See
+ * __timer_enable_traps(), which is where the stuff happens.
+ */
+ if (!has_vhe())
+ return;
+
+ /*
+ * Our default policy is not to trap anything. As we progress
+ * within this function, reality kicks in and we start adding
+ * traps based on emulation requirements.
+ */
+ tpt = tpc = false;
+
+ /*
+ * We have two possibility to deal with a physical offset:
+ *
+ * - Either we have CNTPOFF (yay!) or the offset is 0:
+ * we let the guest freely access the HW
+ *
+ * - or neither of these condition apply:
+ * we trap accesses to the HW, but still use it
+ * after correcting the physical offset
+ */
+ if (!has_cntpoff() && timer_get_offset(map->direct_ptimer))
+ tpt = tpc = true;
+
+ /*
+ * Apply the enable bits that the guest hypervisor has requested for
+ * its own guest. We can only add traps that wouldn't have been set
+ * above.
+ */
+ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+
+ /* Use the VHE format for mental sanity */
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10;
+
+ tpt |= !(val & (CNTHCTL_EL1PCEN << 10));
+ tpc |= !(val & (CNTHCTL_EL1PCTEN << 10));
+ }
+
+ /*
+ * Now that we have collected our requirements, compute the
+ * trap and enable bits.
+ */
+ set = 0;
+ clr = 0;
+
+ assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr);
+ assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr);
+
+ /* This only happens on VHE, so use the CNTKCTL_EL1 accessor */
+ sysreg_clear_set(cntkctl_el1, clr, set);
+}
+
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = vcpu_timer(vcpu);
@@ -632,6 +842,9 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
get_timer_map(vcpu, &map);
if (static_branch_likely(&has_gic_active_state)) {
+ if (vcpu_has_nv(vcpu))
+ kvm_timer_vcpu_load_nested_switch(vcpu, &map);
+
kvm_timer_vcpu_load_gic(map.direct_vtimer);
if (map.direct_ptimer)
kvm_timer_vcpu_load_gic(map.direct_ptimer);
@@ -644,9 +857,12 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
timer_restore_state(map.direct_vtimer);
if (map.direct_ptimer)
timer_restore_state(map.direct_ptimer);
-
+ if (map.emul_vtimer)
+ timer_emulate(map.emul_vtimer);
if (map.emul_ptimer)
timer_emulate(map.emul_ptimer);
+
+ timer_set_traps(vcpu, &map);
}
bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
@@ -689,6 +905,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
* In any case, we re-schedule the hrtimer for the physical timer when
* coming back to the VCPU thread in kvm_timer_vcpu_load().
*/
+ if (map.emul_vtimer)
+ soft_timer_cancel(&map.emul_vtimer->hrtimer);
if (map.emul_ptimer)
soft_timer_cancel(&map.emul_ptimer->hrtimer);
@@ -738,56 +956,89 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
* resets the timer to be disabled and unmasked and is compliant with
* the ARMv7 architecture.
*/
- timer_set_ctl(vcpu_vtimer(vcpu), 0);
- timer_set_ctl(vcpu_ptimer(vcpu), 0);
+ for (int i = 0; i < nr_timers(vcpu); i++)
+ timer_set_ctl(vcpu_get_timer(vcpu, i), 0);
+
+ /*
+ * A vcpu running at EL2 is in charge of the offset applied to
+ * the virtual timer, so use the physical VM offset, and point
+ * the vcpu offset to CNTVOFF_EL2.
+ */
+ if (vcpu_has_nv(vcpu)) {
+ struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset;
+
+ offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
+ offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset;
+ }
if (timer->enabled) {
- kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
- kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+ for (int i = 0; i < nr_timers(vcpu); i++)
+ kvm_timer_update_irq(vcpu, false,
+ vcpu_get_timer(vcpu, i));
if (irqchip_in_kernel(vcpu->kvm)) {
- kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+ kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer));
if (map.direct_ptimer)
- kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+ kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer));
}
}
+ if (map.emul_vtimer)
+ soft_timer_cancel(&map.emul_vtimer->hrtimer);
if (map.emul_ptimer)
soft_timer_cancel(&map.emul_ptimer->hrtimer);
return 0;
}
+static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
+{
+ struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid);
+ struct kvm *kvm = vcpu->kvm;
+
+ ctxt->vcpu = vcpu;
+
+ if (timerid == TIMER_VTIMER)
+ ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset;
+ else
+ ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset;
+
+ hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ ctxt->hrtimer.function = kvm_hrtimer_expire;
+
+ switch (timerid) {
+ case TIMER_PTIMER:
+ case TIMER_HPTIMER:
+ ctxt->host_timer_irq = host_ptimer_irq;
+ break;
+ case TIMER_VTIMER:
+ case TIMER_HVTIMER:
+ ctxt->host_timer_irq = host_vtimer_irq;
+ break;
+ }
+}
+
void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = vcpu_timer(vcpu);
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
- vtimer->vcpu = vcpu;
- vtimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.voffset;
- ptimer->vcpu = vcpu;
+ for (int i = 0; i < NR_KVM_TIMERS; i++)
+ timer_context_init(vcpu, i);
- /* Synchronize cntvoff across all vtimers of a VM. */
- timer_set_offset(vtimer, kvm_phys_timer_read());
- timer_set_offset(ptimer, 0);
+ /* Synchronize offsets across timers of a VM if not already provided */
+ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) {
+ timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read());
+ timer_set_offset(vcpu_ptimer(vcpu), 0);
+ }
hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
timer->bg_timer.function = kvm_bg_timer_expire;
+}
- hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- vtimer->hrtimer.function = kvm_hrtimer_expire;
- ptimer->hrtimer.function = kvm_hrtimer_expire;
-
- vtimer->irq.irq = default_vtimer_irq.irq;
- ptimer->irq.irq = default_ptimer_irq.irq;
-
- vtimer->host_timer_irq = host_vtimer_irq;
- ptimer->host_timer_irq = host_ptimer_irq;
-
- vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
- ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
+void kvm_timer_init_vm(struct kvm *kvm)
+{
+ for (int i = 0; i < NR_KVM_TIMERS; i++)
+ kvm->arch.timer_data.ppi[i] = default_ppi[i];
}
void kvm_timer_cpu_up(void)
@@ -814,8 +1065,11 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
break;
case KVM_REG_ARM_TIMER_CNT:
- timer = vcpu_vtimer(vcpu);
- timer_set_offset(timer, kvm_phys_timer_read() - value);
+ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET,
+ &vcpu->kvm->arch.flags)) {
+ timer = vcpu_vtimer(vcpu);
+ timer_set_offset(timer, kvm_phys_timer_read() - value);
+ }
break;
case KVM_REG_ARM_TIMER_CVAL:
timer = vcpu_vtimer(vcpu);
@@ -825,6 +1079,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
timer = vcpu_ptimer(vcpu);
kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
break;
+ case KVM_REG_ARM_PTIMER_CNT:
+ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET,
+ &vcpu->kvm->arch.flags)) {
+ timer = vcpu_ptimer(vcpu);
+ timer_set_offset(timer, kvm_phys_timer_read() - value);
+ }
+ break;
case KVM_REG_ARM_PTIMER_CVAL:
timer = vcpu_ptimer(vcpu);
kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
@@ -902,6 +1163,10 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
val = kvm_phys_timer_read() - timer_get_offset(timer);
break;
+ case TIMER_REG_VOFF:
+ val = *timer->offset.vcpu_offset;
+ break;
+
default:
BUG();
}
@@ -920,7 +1185,7 @@ u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
get_timer_map(vcpu, &map);
timer = vcpu_get_timer(vcpu, tmr);
- if (timer == map.emul_ptimer)
+ if (timer == map.emul_vtimer || timer == map.emul_ptimer)
return kvm_arm_timer_read(vcpu, timer, treg);
preempt_disable();
@@ -952,6 +1217,10 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
timer_set_cval(timer, val);
break;
+ case TIMER_REG_VOFF:
+ *timer->offset.vcpu_offset = val;
+ break;
+
default:
BUG();
}
@@ -967,7 +1236,7 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
get_timer_map(vcpu, &map);
timer = vcpu_get_timer(vcpu, tmr);
- if (timer == map.emul_ptimer) {
+ if (timer == map.emul_vtimer || timer == map.emul_ptimer) {
soft_timer_cancel(&timer->hrtimer);
kvm_arm_timer_write(vcpu, timer, treg, val);
timer_emulate(timer);
@@ -1047,10 +1316,6 @@ static const struct irq_domain_ops timer_domain_ops = {
.free = timer_irq_domain_free,
};
-static struct irq_ops arch_timer_irq_ops = {
- .get_input_level = kvm_arch_timer_get_input_level,
-};
-
static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags)
{
*flags = irq_get_trigger_type(virq);
@@ -1192,44 +1457,56 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
{
- int vtimer_irq, ptimer_irq, ret;
- unsigned long i;
+ u32 ppis = 0;
+ bool valid;
- vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
- ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
- if (ret)
- return false;
+ mutex_lock(&vcpu->kvm->arch.config_lock);
- ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
- ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
- if (ret)
- return false;
+ for (int i = 0; i < nr_timers(vcpu); i++) {
+ struct arch_timer_context *ctx;
+ int irq;
+
+ ctx = vcpu_get_timer(vcpu, i);
+ irq = timer_irq(ctx);
+ if (kvm_vgic_set_owner(vcpu, irq, ctx))
+ break;
- kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
- if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
- vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
- return false;
+ /*
+ * We know by construction that we only have PPIs, so
+ * all values are less than 32.
+ */
+ ppis |= BIT(irq);
}
- return true;
+ valid = hweight32(ppis) == nr_timers(vcpu);
+
+ if (valid)
+ set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags);
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ return valid;
}
-bool kvm_arch_timer_get_input_level(int vintid)
+static bool kvm_arch_timer_get_input_level(int vintid)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
- struct arch_timer_context *timer;
if (WARN(!vcpu, "No vcpu context!\n"))
return false;
- if (vintid == vcpu_vtimer(vcpu)->irq.irq)
- timer = vcpu_vtimer(vcpu);
- else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
- timer = vcpu_ptimer(vcpu);
- else
- BUG();
+ for (int i = 0; i < nr_timers(vcpu); i++) {
+ struct arch_timer_context *ctx;
+
+ ctx = vcpu_get_timer(vcpu, i);
+ if (timer_irq(ctx) == vintid)
+ return kvm_timer_should_fire(ctx);
+ }
+
+ /* A timer IRQ has fired, but no matching timer was found? */
+ WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid);
- return kvm_timer_should_fire(timer);
+ return false;
}
int kvm_timer_enable(struct kvm_vcpu *vcpu)
@@ -1258,7 +1535,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_vtimer->host_timer_irq,
- map.direct_vtimer->irq.irq,
+ timer_irq(map.direct_vtimer),
&arch_timer_irq_ops);
if (ret)
return ret;
@@ -1266,7 +1543,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
if (map.direct_ptimer) {
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_ptimer->host_timer_irq,
- map.direct_ptimer->irq.irq,
+ timer_irq(map.direct_ptimer),
&arch_timer_irq_ops);
}
@@ -1278,45 +1555,17 @@ no_vgic:
return 0;
}
-/*
- * On VHE system, we only need to configure the EL2 timer trap register once,
- * not for every world switch.
- * The host kernel runs at EL2 with HCR_EL2.TGE == 1,
- * and this makes those bits have no effect for the host kernel execution.
- */
+/* If we have CNTPOFF, permanently set ECV to enable it */
void kvm_timer_init_vhe(void)
{
- /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */
- u32 cnthctl_shift = 10;
- u64 val;
-
- /*
- * VHE systems allow the guest direct access to the EL1 physical
- * timer/counter.
- */
- val = read_sysreg(cnthctl_el2);
- val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
- val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
- write_sysreg(val, cnthctl_el2);
-}
-
-static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
- vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
- }
+ if (cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF))
+ sysreg_clear_set(cntkctl_el1, 0, CNTHCTL_ECV);
}
int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
int __user *uaddr = (int __user *)(long)attr->addr;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
- int irq;
+ int irq, idx, ret = 0;
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
@@ -1327,21 +1576,42 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!(irq_is_ppi(irq)))
return -EINVAL;
- if (vcpu->arch.timer_cpu.enabled)
- return -EBUSY;
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+
+ if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE,
+ &vcpu->kvm->arch.flags)) {
+ ret = -EBUSY;
+ goto out;
+ }
switch (attr->attr) {
case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
- set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
+ idx = TIMER_VTIMER;
break;
case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
- set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
+ idx = TIMER_PTIMER;
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+ idx = TIMER_HVTIMER;
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
+ idx = TIMER_HPTIMER;
break;
default:
- return -ENXIO;
+ ret = -ENXIO;
+ goto out;
}
- return 0;
+ /*
+ * We cannot validate the IRQ unicity before we run, so take it at
+ * face value. The verdict will be given on first vcpu run, for each
+ * vcpu. Yes this is late. Blame it on the stupid API.
+ */
+ vcpu->kvm->arch.timer_data.ppi[idx] = irq;
+
+out:
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+ return ret;
}
int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
@@ -1357,11 +1627,17 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
timer = vcpu_ptimer(vcpu);
break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+ timer = vcpu_hvtimer(vcpu);
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
+ timer = vcpu_hptimer(vcpu);
+ break;
default:
return -ENXIO;
}
- irq = timer->irq.irq;
+ irq = timer_irq(timer);
return put_user(irq, uaddr);
}
@@ -1370,8 +1646,42 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+ case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
+ case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
return 0;
}
return -ENXIO;
}
+
+int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
+ struct kvm_arm_counter_offset *offset)
+{
+ int ret = 0;
+
+ if (offset->reserved)
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (lock_all_vcpus(kvm)) {
+ set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags);
+
+ /*
+ * If userspace decides to set the offset using this
+ * API rather than merely restoring the counter
+ * values, the offset applies to both the virtual and
+ * physical views.
+ */
+ kvm->arch.timer_data.voffset = offset->counter_offset;
+ kvm->arch.timer_data.poffset = offset->counter_offset;
+
+ unlock_all_vcpus(kvm);
+ } else {
+ ret = -EBUSY;
+ }
+
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a43e1cb3b7e9..95b715cdf6f3 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -128,6 +128,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ mutex_init(&kvm->arch.config_lock);
+
+#ifdef CONFIG_LOCKDEP
+ /* Clue in lockdep that the config_lock must be taken inside kvm->lock */
+ mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
+ mutex_unlock(&kvm->arch.config_lock);
+ mutex_unlock(&kvm->lock);
+#endif
+
ret = kvm_share_hyp(kvm, kvm + 1);
if (ret)
return ret;
@@ -148,6 +158,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_vgic_early_init(kvm);
+ kvm_timer_init_vm(kvm);
+
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->max_vcpus = kvm_arm_default_max_vcpus();
@@ -192,6 +204,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_destroy_vcpus(kvm);
kvm_unshare_hyp(kvm, kvm + 1);
+
+ kvm_arm_teardown_hypercalls(kvm);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -220,6 +234,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_PTP_KVM:
case KVM_CAP_ARM_SYSTEM_SUSPEND:
+ case KVM_CAP_COUNTER_OFFSET:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -326,6 +341,16 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
int err;
+ spin_lock_init(&vcpu->arch.mp_state_lock);
+
+#ifdef CONFIG_LOCKDEP
+ /* Inform lockdep that the config_lock is acquired after vcpu->mutex */
+ mutex_lock(&vcpu->mutex);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+ mutex_unlock(&vcpu->mutex);
+#endif
+
/* Force users to call KVM_ARM_VCPU_INIT */
vcpu->arch.target = -1;
bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
@@ -443,34 +468,41 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
+static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
{
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
kvm_make_request(KVM_REQ_SLEEP, vcpu);
kvm_vcpu_kick(vcpu);
}
+void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->arch.mp_state_lock);
+ __kvm_arm_vcpu_power_off(vcpu);
+ spin_unlock(&vcpu->arch.mp_state_lock);
+}
+
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
+ return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
}
static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
{
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
kvm_make_request(KVM_REQ_SUSPEND, vcpu);
kvm_vcpu_kick(vcpu);
}
static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
+ return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- *mp_state = vcpu->arch.mp_state;
+ *mp_state = READ_ONCE(vcpu->arch.mp_state);
return 0;
}
@@ -480,12 +512,14 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
{
int ret = 0;
+ spin_lock(&vcpu->arch.mp_state_lock);
+
switch (mp_state->mp_state) {
case KVM_MP_STATE_RUNNABLE:
- vcpu->arch.mp_state = *mp_state;
+ WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
break;
case KVM_MP_STATE_STOPPED:
- kvm_arm_vcpu_power_off(vcpu);
+ __kvm_arm_vcpu_power_off(vcpu);
break;
case KVM_MP_STATE_SUSPENDED:
kvm_arm_vcpu_suspend(vcpu);
@@ -494,6 +528,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
ret = -EINVAL;
}
+ spin_unlock(&vcpu->arch.mp_state_lock);
+
return ret;
}
@@ -593,9 +629,9 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (kvm_vm_is_protected(kvm))
kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
@@ -1210,10 +1246,14 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
/*
* Handle the "start in power-off" case.
*/
+ spin_lock(&vcpu->arch.mp_state_lock);
+
if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
- kvm_arm_vcpu_power_off(vcpu);
+ __kvm_arm_vcpu_power_off(vcpu);
else
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
+
+ spin_unlock(&vcpu->arch.mp_state_lock);
return 0;
}
@@ -1439,10 +1479,31 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
}
}
+static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_ARM_VM_SMCCC_CTRL:
+ return kvm_vm_smccc_has_attr(kvm, attr);
+ default:
+ return -ENXIO;
+ }
+}
+
+static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_ARM_VM_SMCCC_CTRL:
+ return kvm_vm_smccc_set_attr(kvm, attr);
+ default:
+ return -ENXIO;
+ }
+}
+
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
+ struct kvm_device_attr attr;
switch (ioctl) {
case KVM_CREATE_IRQCHIP: {
@@ -1478,11 +1539,73 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
return -EFAULT;
return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
}
+ case KVM_ARM_SET_COUNTER_OFFSET: {
+ struct kvm_arm_counter_offset offset;
+
+ if (copy_from_user(&offset, argp, sizeof(offset)))
+ return -EFAULT;
+ return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ return kvm_vm_has_attr(kvm, &attr);
+ }
+ case KVM_SET_DEVICE_ATTR: {
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ return kvm_vm_set_attr(kvm, &attr);
+ }
default:
return -EINVAL;
}
}
+/* unlocks vcpus from @vcpu_lock_idx and smaller */
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+ struct kvm_vcpu *tmp_vcpu;
+
+ for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+ tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+ mutex_unlock(&tmp_vcpu->mutex);
+ }
+}
+
+void unlock_all_vcpus(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->lock);
+
+ unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+/* Returns true if all vcpus were locked, false otherwise */
+bool lock_all_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *tmp_vcpu;
+ unsigned long c;
+
+ lockdep_assert_held(&kvm->lock);
+
+ /*
+ * Any time a vcpu is in an ioctl (including running), the
+ * core KVM code tries to grab the vcpu->mutex.
+ *
+ * By grabbing the vcpu->mutex of all VCPUs we ensure that no
+ * other VCPUs can fiddle with the state while we access it.
+ */
+ kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+ if (!mutex_trylock(&tmp_vcpu->mutex)) {
+ unlock_vcpus(kvm, c - 1);
+ return false;
+ }
+ }
+
+ return true;
+}
+
static unsigned long nvhe_percpu_size(void)
{
return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 26a2ebc465ea..20280a5233f6 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -590,11 +590,16 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu)
return copy_core_reg_indices(vcpu, NULL);
}
-/**
- * ARM64 versions of the TIMER registers, always available on arm64
- */
+static const u64 timer_reg_list[] = {
+ KVM_REG_ARM_TIMER_CTL,
+ KVM_REG_ARM_TIMER_CNT,
+ KVM_REG_ARM_TIMER_CVAL,
+ KVM_REG_ARM_PTIMER_CTL,
+ KVM_REG_ARM_PTIMER_CNT,
+ KVM_REG_ARM_PTIMER_CVAL,
+};
-#define NUM_TIMER_REGS 3
+#define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list)
static bool is_timer_reg(u64 index)
{
@@ -602,6 +607,9 @@ static bool is_timer_reg(u64 index)
case KVM_REG_ARM_TIMER_CTL:
case KVM_REG_ARM_TIMER_CNT:
case KVM_REG_ARM_TIMER_CVAL:
+ case KVM_REG_ARM_PTIMER_CTL:
+ case KVM_REG_ARM_PTIMER_CNT:
+ case KVM_REG_ARM_PTIMER_CVAL:
return true;
}
return false;
@@ -609,14 +617,11 @@ static bool is_timer_reg(u64 index)
static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
- if (put_user(KVM_REG_ARM_TIMER_CTL, uindices))
- return -EFAULT;
- uindices++;
- if (put_user(KVM_REG_ARM_TIMER_CNT, uindices))
- return -EFAULT;
- uindices++;
- if (put_user(KVM_REG_ARM_TIMER_CVAL, uindices))
- return -EFAULT;
+ for (int i = 0; i < NUM_TIMER_REGS; i++) {
+ if (put_user(timer_reg_list[i], uindices))
+ return -EFAULT;
+ uindices++;
+ }
return 0;
}
@@ -957,7 +962,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
switch (attr->group) {
case KVM_ARM_VCPU_PMU_V3_CTRL:
+ mutex_lock(&vcpu->kvm->arch.config_lock);
ret = kvm_arm_pmu_v3_set_attr(vcpu, attr);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
break;
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_set_attr(vcpu, attr);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index a798c0b4d717..6dcd6604b6bc 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -36,8 +36,6 @@ static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
static int handle_hvc(struct kvm_vcpu *vcpu)
{
- int ret;
-
trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0),
kvm_vcpu_hvc_get_imm(vcpu));
vcpu->stat.hvc_exit_stat++;
@@ -52,33 +50,29 @@ static int handle_hvc(struct kvm_vcpu *vcpu)
return 1;
}
- ret = kvm_hvc_call_handler(vcpu);
- if (ret < 0) {
- vcpu_set_reg(vcpu, 0, ~0UL);
- return 1;
- }
-
- return ret;
+ return kvm_smccc_call_handler(vcpu);
}
static int handle_smc(struct kvm_vcpu *vcpu)
{
- int ret;
-
/*
* "If an SMC instruction executed at Non-secure EL1 is
* trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
* Trap exception, not a Secure Monitor Call exception [...]"
*
* We need to advance the PC after the trap, as it would
- * otherwise return to the same address...
- *
- * Only handle SMCs from the virtual EL2 with an immediate of zero and
- * skip it otherwise.
+ * otherwise return to the same address. Furthermore, pre-incrementing
+ * the PC before potentially exiting to userspace maintains the same
+ * abstraction for both SMCs and HVCs.
+ */
+ kvm_incr_pc(vcpu);
+
+ /*
+ * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9
+ * "SMC and HVC immediate value".
*/
- if (!vcpu_is_el2(vcpu) || kvm_vcpu_hvc_get_imm(vcpu)) {
+ if (kvm_vcpu_hvc_get_imm(vcpu)) {
vcpu_set_reg(vcpu, 0, ~0UL);
- kvm_incr_pc(vcpu);
return 1;
}
@@ -89,13 +83,7 @@ static int handle_smc(struct kvm_vcpu *vcpu)
* at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than
* being treated as UNDEFINED.
*/
- ret = kvm_hvc_call_handler(vcpu);
- if (ret < 0)
- vcpu_set_reg(vcpu, 0, ~0UL);
-
- kvm_incr_pc(vcpu);
-
- return ret;
+ return kvm_smccc_call_handler(vcpu);
}
/*
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 07d37ff88a3f..c41166f1a1dd 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -26,6 +26,7 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/fpsimd.h>
#include <asm/debug-monitors.h>
#include <asm/processor.h>
@@ -326,6 +327,55 @@ static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code)
return true;
}
+static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_context *ctxt;
+ u32 sysreg;
+ u64 val;
+
+ /*
+ * We only get here for 64bit guests, 32bit guests will hit
+ * the long and winding road all the way to the standard
+ * handling. Yes, it sucks to be irrelevant.
+ */
+ sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+
+ switch (sysreg) {
+ case SYS_CNTPCT_EL0:
+ case SYS_CNTPCTSS_EL0:
+ if (vcpu_has_nv(vcpu)) {
+ if (is_hyp_ctxt(vcpu)) {
+ ctxt = vcpu_hptimer(vcpu);
+ break;
+ }
+
+ /* Check for guest hypervisor trapping */
+ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = (val & CNTHCTL_EL1PCTEN) << 10;
+
+ if (!(val & (CNTHCTL_EL1PCTEN << 10)))
+ return false;
+ }
+
+ ctxt = vcpu_ptimer(vcpu);
+ break;
+ default:
+ return false;
+ }
+
+ val = arch_timer_read_cntpct_el0();
+
+ if (ctxt->offset.vm_offset)
+ val -= *kern_hyp_va(ctxt->offset.vm_offset);
+ if (ctxt->offset.vcpu_offset)
+ val -= *kern_hyp_va(ctxt->offset.vcpu_offset);
+
+ vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val);
+ __kvm_skip_instr(vcpu);
+ return true;
+}
+
static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
@@ -339,6 +389,9 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
return kvm_hyp_handle_ptrauth(vcpu, exit_code);
+ if (kvm_hyp_handle_cntpct(vcpu))
+ return true;
+
return false;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 2673bde62fad..d756b939f296 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -37,7 +37,6 @@ static void __debug_save_spe(u64 *pmscr_el1)
/* Now drain all buffered data to memory */
psb_csync();
- dsb(nsh);
}
static void __debug_restore_spe(u64 pmscr_el1)
@@ -69,7 +68,6 @@ static void __debug_save_trace(u64 *trfcr_el1)
isb();
/* Drain the trace buffer to memory */
tsb_csync();
- dsb(nsh);
}
static void __debug_restore_trace(u64 trfcr_el1)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 552653fa18be..2e9ec4a2a4a3 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -297,6 +297,13 @@ int __pkvm_prot_finalize(void)
params->vttbr = kvm_get_vttbr(mmu);
params->vtcr = host_mmu.arch.vtcr;
params->hcr_el2 |= HCR_VM;
+
+ /*
+ * The CMO below not only cleans the updated params to the
+ * PoC, but also provides the DSB that ensures ongoing
+ * page-table walks that have started before we trapped to EL2
+ * have completed.
+ */
kvm_flush_dcache_to_poc(params, sizeof(*params));
write_sysreg(params->hcr_el2, hcr_el2);
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index c2cb46ca4fb6..71fa16a0dc77 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -272,6 +272,17 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
*/
__debug_save_host_buffers_nvhe(vcpu);
+ /*
+ * We're about to restore some new MMU state. Make sure
+ * ongoing page-table walks that have started before we
+ * trapped to EL2 have completed. This also synchronises the
+ * above disabling of SPE and TRBE.
+ *
+ * See DDI0487I.a D8.1.5 "Out-of-context translation regimes",
+ * rule R_LFHQG and subsequent information statements.
+ */
+ dsb(nsh);
+
__kvm_adjust_pc(vcpu);
/*
@@ -306,6 +317,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__timer_disable_traps(vcpu);
__hyp_vgic_save_state(vcpu);
+ /*
+ * Same thing as before the guest run: we're about to switch
+ * the MMU context, so let's make sure we don't have any
+ * ongoing EL1&0 translations.
+ */
+ dsb(nsh);
+
__deactivate_traps(vcpu);
__load_host_stage2();
diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
index 9072e71693ba..b185ac0dbd47 100644
--- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
@@ -9,6 +9,7 @@
#include <linux/kvm_host.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
void __kvm_timer_set_cntvoff(u64 cntvoff)
{
@@ -35,14 +36,19 @@ void __timer_disable_traps(struct kvm_vcpu *vcpu)
*/
void __timer_enable_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ u64 clr = 0, set = 0;
/*
* Disallow physical timer access for the guest
- * Physical counter access is allowed
+ * Physical counter access is allowed if no offset is enforced
+ * or running protected (we don't offset anything in this case).
*/
- val = read_sysreg(cnthctl_el2);
- val &= ~CNTHCTL_EL1PCEN;
- val |= CNTHCTL_EL1PCTEN;
- write_sysreg(val, cnthctl_el2);
+ clr = CNTHCTL_EL1PCEN;
+ if (is_protected_kvm_enabled() ||
+ !kern_hyp_va(vcpu->kvm)->arch.timer_data.poffset)
+ set |= CNTHCTL_EL1PCTEN;
+ else
+ clr |= CNTHCTL_EL1PCTEN;
+
+ sysreg_clear_set(cnthctl_el2, clr, set);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index d296d617f589..978179133f4b 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -15,8 +15,31 @@ struct tlb_inv_context {
};
static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
- struct tlb_inv_context *cxt)
+ struct tlb_inv_context *cxt,
+ bool nsh)
{
+ /*
+ * We have two requirements:
+ *
+ * - ensure that the page table updates are visible to all
+ * CPUs, for which a dsb(DOMAIN-st) is what we need, DOMAIN
+ * being either ish or nsh, depending on the invalidation
+ * type.
+ *
+ * - complete any speculative page table walk started before
+ * we trapped to EL2 so that we can mess with the MM
+ * registers out of context, for which dsb(nsh) is enough
+ *
+ * The composition of these two barriers is a dsb(DOMAIN), and
+ * the 'nsh' parameter tracks the distinction between
+ * Inner-Shareable and Non-Shareable, as specified by the
+ * callers.
+ */
+ if (nsh)
+ dsb(nsh);
+ else
+ dsb(ish);
+
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
u64 val;
@@ -60,10 +83,8 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
{
struct tlb_inv_context cxt;
- dsb(ishst);
-
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ __tlb_switch_to_guest(mmu, &cxt, false);
/*
* We could do so much better if we had the VA as well.
@@ -113,10 +134,8 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
- dsb(ishst);
-
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ __tlb_switch_to_guest(mmu, &cxt, false);
__tlbi(vmalls12e1is);
dsb(ish);
@@ -130,7 +149,7 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
struct tlb_inv_context cxt;
/* Switch to requested VMID */
- __tlb_switch_to_guest(mmu, &cxt);
+ __tlb_switch_to_guest(mmu, &cxt, false);
__tlbi(vmalle1);
asm volatile("ic iallu");
@@ -142,7 +161,8 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
void __kvm_flush_vm_context(void)
{
- dsb(ishst);
+ /* Same remark as in __tlb_switch_to_guest() */
+ dsb(ish);
__tlbi(alle1is);
/*
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index cd3f3117bf16..3d868e84c7a0 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -227,11 +227,10 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
/*
* When we exit from the guest we change a number of CPU configuration
- * parameters, such as traps. Make sure these changes take effect
- * before running the host or additional guests.
+ * parameters, such as traps. We rely on the isb() in kvm_call_hyp*()
+ * to make sure these changes take effect before running the host or
+ * additional guests.
*/
- isb();
-
return ret;
}
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 7b44f6b3b547..b35a178e7e0d 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -13,6 +13,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_nested.h>
/*
* VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and
@@ -70,6 +71,17 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
__sysreg_save_user_state(host_ctxt);
/*
+ * When running a normal EL1 guest, we only load a new vcpu
+ * after a context switch, which imvolves a DSB, so all
+ * speculative EL1&0 walks will have already completed.
+ * If running NV, the vcpu may transition between vEL1 and
+ * vEL2 without a context switch, so make sure we complete
+ * those walks before loading a new context.
+ */
+ if (vcpu_has_nv(vcpu))
+ dsb(nsh);
+
+ /*
* Load guest EL1 and user state
*
* We must restore the 32-bit state before the sysregs, thanks
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 5da884e11337..2e16fc7b31bf 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -47,7 +47,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.voffset;
break;
case KVM_PTP_PHYS_COUNTER:
- cycles = systime_snapshot.cycles;
+ cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.poffset;
break;
default:
return;
@@ -65,7 +65,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
val[3] = lower_32_bits(cycles);
}
-static bool kvm_hvc_call_default_allowed(u32 func_id)
+static bool kvm_smccc_default_allowed(u32 func_id)
{
switch (func_id) {
/*
@@ -93,7 +93,7 @@ static bool kvm_hvc_call_default_allowed(u32 func_id)
}
}
-static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id)
+static bool kvm_smccc_test_fw_bmap(struct kvm_vcpu *vcpu, u32 func_id)
{
struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
@@ -117,20 +117,161 @@ static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id)
return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP,
&smccc_feat->vendor_hyp_bmap);
default:
- return kvm_hvc_call_default_allowed(func_id);
+ return false;
+ }
+}
+
+#define SMC32_ARCH_RANGE_BEGIN ARM_SMCCC_VERSION_FUNC_ID
+#define SMC32_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_32, \
+ 0, ARM_SMCCC_FUNC_MASK)
+
+#define SMC64_ARCH_RANGE_BEGIN ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ 0, 0)
+#define SMC64_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ 0, ARM_SMCCC_FUNC_MASK)
+
+static void init_smccc_filter(struct kvm *kvm)
+{
+ int r;
+
+ mt_init(&kvm->arch.smccc_filter);
+
+ /*
+ * Prevent userspace from handling any SMCCC calls in the architecture
+ * range, avoiding the risk of misrepresenting Spectre mitigation status
+ * to the guest.
+ */
+ r = mtree_insert_range(&kvm->arch.smccc_filter,
+ SMC32_ARCH_RANGE_BEGIN, SMC32_ARCH_RANGE_END,
+ xa_mk_value(KVM_SMCCC_FILTER_HANDLE),
+ GFP_KERNEL_ACCOUNT);
+ WARN_ON_ONCE(r);
+
+ r = mtree_insert_range(&kvm->arch.smccc_filter,
+ SMC64_ARCH_RANGE_BEGIN, SMC64_ARCH_RANGE_END,
+ xa_mk_value(KVM_SMCCC_FILTER_HANDLE),
+ GFP_KERNEL_ACCOUNT);
+ WARN_ON_ONCE(r);
+
+}
+
+static int kvm_smccc_set_filter(struct kvm *kvm, struct kvm_smccc_filter __user *uaddr)
+{
+ const void *zero_page = page_to_virt(ZERO_PAGE(0));
+ struct kvm_smccc_filter filter;
+ u32 start, end;
+ int r;
+
+ if (copy_from_user(&filter, uaddr, sizeof(filter)))
+ return -EFAULT;
+
+ if (memcmp(filter.pad, zero_page, sizeof(filter.pad)))
+ return -EINVAL;
+
+ start = filter.base;
+ end = start + filter.nr_functions - 1;
+
+ if (end < start || filter.action >= NR_SMCCC_FILTER_ACTIONS)
+ return -EINVAL;
+
+ mutex_lock(&kvm->arch.config_lock);
+
+ if (kvm_vm_has_ran_once(kvm)) {
+ r = -EBUSY;
+ goto out_unlock;
}
+
+ r = mtree_insert_range(&kvm->arch.smccc_filter, start, end,
+ xa_mk_value(filter.action), GFP_KERNEL_ACCOUNT);
+ if (r)
+ goto out_unlock;
+
+ set_bit(KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED, &kvm->arch.flags);
+
+out_unlock:
+ mutex_unlock(&kvm->arch.config_lock);
+ return r;
+}
+
+static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id)
+{
+ unsigned long idx = func_id;
+ void *val;
+
+ if (!test_bit(KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED, &kvm->arch.flags))
+ return KVM_SMCCC_FILTER_HANDLE;
+
+ /*
+ * But where's the error handling, you say?
+ *
+ * mt_find() returns NULL if no entry was found, which just so happens
+ * to match KVM_SMCCC_FILTER_HANDLE.
+ */
+ val = mt_find(&kvm->arch.smccc_filter, &idx, idx);
+ return xa_to_value(val);
}
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+static u8 kvm_smccc_get_action(struct kvm_vcpu *vcpu, u32 func_id)
+{
+ /*
+ * Intervening actions in the SMCCC filter take precedence over the
+ * pseudo-firmware register bitmaps.
+ */
+ u8 action = kvm_smccc_filter_get_action(vcpu->kvm, func_id);
+ if (action != KVM_SMCCC_FILTER_HANDLE)
+ return action;
+
+ if (kvm_smccc_test_fw_bmap(vcpu, func_id) ||
+ kvm_smccc_default_allowed(func_id))
+ return KVM_SMCCC_FILTER_HANDLE;
+
+ return KVM_SMCCC_FILTER_DENY;
+}
+
+static void kvm_prepare_hypercall_exit(struct kvm_vcpu *vcpu, u32 func_id)
+{
+ u8 ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu));
+ struct kvm_run *run = vcpu->run;
+ u64 flags = 0;
+
+ if (ec == ESR_ELx_EC_SMC32 || ec == ESR_ELx_EC_SMC64)
+ flags |= KVM_HYPERCALL_EXIT_SMC;
+
+ if (!kvm_vcpu_trap_il_is32bit(vcpu))
+ flags |= KVM_HYPERCALL_EXIT_16BIT;
+
+ run->exit_reason = KVM_EXIT_HYPERCALL;
+ run->hypercall = (typeof(run->hypercall)) {
+ .nr = func_id,
+ .flags = flags,
+ };
+}
+
+int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
{
struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
u32 func_id = smccc_get_function(vcpu);
u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
u32 feature;
+ u8 action;
gpa_t gpa;
- if (!kvm_hvc_call_allowed(vcpu, func_id))
+ action = kvm_smccc_get_action(vcpu, func_id);
+ switch (action) {
+ case KVM_SMCCC_FILTER_HANDLE:
+ break;
+ case KVM_SMCCC_FILTER_DENY:
+ goto out;
+ case KVM_SMCCC_FILTER_FWD_TO_USER:
+ kvm_prepare_hypercall_exit(vcpu, func_id);
+ return 0;
+ default:
+ WARN_RATELIMIT(1, "Unhandled SMCCC filter action: %d\n", action);
goto out;
+ }
switch (func_id) {
case ARM_SMCCC_VERSION_FUNC_ID:
@@ -245,6 +386,13 @@ void kvm_arm_init_hypercalls(struct kvm *kvm)
smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES;
smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES;
smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
+
+ init_smccc_filter(kvm);
+}
+
+void kvm_arm_teardown_hypercalls(struct kvm *kvm)
+{
+ mtree_destroy(&kvm->arch.smccc_filter);
}
int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
@@ -377,17 +525,16 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val)
if (val & ~fw_reg_features)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
- if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) &&
- val != *fw_reg_bmap) {
+ if (kvm_vm_has_ran_once(kvm) && val != *fw_reg_bmap) {
ret = -EBUSY;
goto out;
}
WRITE_ONCE(*fw_reg_bmap, val);
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
@@ -479,3 +626,25 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
return -EINVAL;
}
+
+int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VM_SMCCC_FILTER:
+ return 0;
+ default:
+ return -ENXIO;
+ }
+}
+
+int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ void __user *uaddr = (void __user *)attr->addr;
+
+ switch (attr->attr) {
+ case KVM_ARM_VM_SMCCC_FILTER:
+ return kvm_smccc_set_filter(kvm, uaddr);
+ default:
+ return -ENXIO;
+ }
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 24908400e190..8402e5a1354e 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -874,13 +874,13 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
struct arm_pmu *arm_pmu;
int ret = -ENXIO;
- mutex_lock(&kvm->lock);
+ lockdep_assert_held(&kvm->arch.config_lock);
mutex_lock(&arm_pmus_lock);
list_for_each_entry(entry, &arm_pmus, entry) {
arm_pmu = entry->arm_pmu;
if (arm_pmu->pmu.type == pmu_id) {
- if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) ||
+ if (kvm_vm_has_ran_once(kvm) ||
(kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) {
ret = -EBUSY;
break;
@@ -894,7 +894,6 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
}
mutex_unlock(&arm_pmus_lock);
- mutex_unlock(&kvm->lock);
return ret;
}
@@ -902,22 +901,20 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
struct kvm *kvm = vcpu->kvm;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (!kvm_vcpu_has_pmu(vcpu))
return -ENODEV;
if (vcpu->arch.pmu.created)
return -EBUSY;
- mutex_lock(&kvm->lock);
if (!kvm->arch.arm_pmu) {
/* No PMU set, get the default one */
kvm->arch.arm_pmu = kvm_pmu_probe_armpmu();
- if (!kvm->arch.arm_pmu) {
- mutex_unlock(&kvm->lock);
+ if (!kvm->arch.arm_pmu)
return -ENODEV;
- }
}
- mutex_unlock(&kvm->lock);
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ: {
@@ -961,19 +958,13 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
filter.action != KVM_PMU_EVENT_DENY))
return -EINVAL;
- mutex_lock(&kvm->lock);
-
- if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags)) {
- mutex_unlock(&kvm->lock);
+ if (kvm_vm_has_ran_once(kvm))
return -EBUSY;
- }
if (!kvm->arch.pmu_filter) {
kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT);
- if (!kvm->arch.pmu_filter) {
- mutex_unlock(&kvm->lock);
+ if (!kvm->arch.pmu_filter)
return -ENOMEM;
- }
/*
* The default depends on the first applied filter.
@@ -992,8 +983,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
else
bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents);
- mutex_unlock(&kvm->lock);
-
return 0;
}
case KVM_ARM_VCPU_PMU_V3_SET_PMU: {
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 7fbc4c1b9df0..1f69b667332b 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -62,6 +62,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
struct vcpu_reset_state *reset_state;
struct kvm *kvm = source_vcpu->kvm;
struct kvm_vcpu *vcpu = NULL;
+ int ret = PSCI_RET_SUCCESS;
unsigned long cpu_id;
cpu_id = smccc_get_arg1(source_vcpu);
@@ -76,11 +77,15 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/
if (!vcpu)
return PSCI_RET_INVALID_PARAMS;
+
+ spin_lock(&vcpu->arch.mp_state_lock);
if (!kvm_arm_vcpu_stopped(vcpu)) {
if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
- return PSCI_RET_ALREADY_ON;
+ ret = PSCI_RET_ALREADY_ON;
else
- return PSCI_RET_INVALID_PARAMS;
+ ret = PSCI_RET_INVALID_PARAMS;
+
+ goto out_unlock;
}
reset_state = &vcpu->arch.reset_state;
@@ -96,7 +101,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/
reset_state->r0 = smccc_get_arg3(source_vcpu);
- WRITE_ONCE(reset_state->reset, true);
+ reset_state->reset = true;
kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
/*
@@ -105,10 +110,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/
smp_wmb();
- vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
kvm_vcpu_wake_up(vcpu);
- return PSCI_RET_SUCCESS;
+out_unlock:
+ spin_unlock(&vcpu->arch.mp_state_lock);
+ return ret;
}
static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
@@ -168,8 +175,11 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
* after this call is handled and before the VCPUs have been
* re-initialized.
*/
- kvm_for_each_vcpu(i, tmp, vcpu->kvm)
- tmp->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
+ kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+ spin_lock(&tmp->arch.mp_state_lock);
+ WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
+ spin_unlock(&tmp->arch.mp_state_lock);
+ }
kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
@@ -229,7 +239,6 @@ static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32
static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
u32 psci_fn = smccc_get_function(vcpu);
unsigned long val;
int ret = 1;
@@ -254,9 +263,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
kvm_psci_narrow_to_32bit(vcpu);
fallthrough;
case PSCI_0_2_FN64_CPU_ON:
- mutex_lock(&kvm->lock);
val = kvm_psci_vcpu_on(vcpu);
- mutex_unlock(&kvm->lock);
break;
case PSCI_0_2_FN_AFFINITY_INFO:
kvm_psci_narrow_to_32bit(vcpu);
@@ -395,7 +402,6 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
u32 psci_fn = smccc_get_function(vcpu);
unsigned long val;
@@ -405,9 +411,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
val = PSCI_RET_SUCCESS;
break;
case KVM_PSCI_FN_CPU_ON:
- mutex_lock(&kvm->lock);
val = kvm_psci_vcpu_on(vcpu);
- mutex_unlock(&kvm->lock);
break;
default:
val = PSCI_RET_NOT_SUPPORTED;
@@ -435,6 +439,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
int kvm_psci_call(struct kvm_vcpu *vcpu)
{
u32 psci_fn = smccc_get_function(vcpu);
+ int version = kvm_psci_version(vcpu);
unsigned long val;
val = kvm_psci_check_allowed_function(vcpu, psci_fn);
@@ -443,7 +448,7 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
return 1;
}
- switch (kvm_psci_version(vcpu)) {
+ switch (version) {
case KVM_ARM_PSCI_1_1:
return kvm_psci_1_x_call(vcpu, 1);
case KVM_ARM_PSCI_1_0:
@@ -453,6 +458,8 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
case KVM_ARM_PSCI_0_1:
return kvm_psci_0_1_call(vcpu);
default:
- return -EINVAL;
+ WARN_ONCE(1, "Unknown PSCI version %d", version);
+ smccc_set_retval(vcpu, SMCCC_RET_NOT_SUPPORTED, 0, 0, 0);
+ return 1;
}
}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 49a3257dec46..b5dee8e57e77 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -205,7 +205,7 @@ static int kvm_set_vm_width(struct kvm_vcpu *vcpu)
is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT);
- lockdep_assert_held(&kvm->lock);
+ lockdep_assert_held(&kvm->arch.config_lock);
if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) {
/*
@@ -262,17 +262,18 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
bool loaded;
u32 pstate;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
ret = kvm_set_vm_width(vcpu);
- if (!ret) {
- reset_state = vcpu->arch.reset_state;
- WRITE_ONCE(vcpu->arch.reset_state.reset, false);
- }
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
if (ret)
return ret;
+ spin_lock(&vcpu->arch.mp_state_lock);
+ reset_state = vcpu->arch.reset_state;
+ vcpu->arch.reset_state.reset = false;
+ spin_unlock(&vcpu->arch.mp_state_lock);
+
/* Reset PMU outside of the non-preemptible section */
kvm_pmu_vcpu_reset(vcpu);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 53749d3a0996..feca77083a5c 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1139,6 +1139,12 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
tmr = TIMER_PTIMER;
treg = TIMER_REG_CVAL;
break;
+ case SYS_CNTPCT_EL0:
+ case SYS_CNTPCTSS_EL0:
+ case SYS_AARCH32_CNTPCT:
+ tmr = TIMER_PTIMER;
+ treg = TIMER_REG_CNT;
+ break;
default:
print_sys_reg_msg(p, "%s", "Unhandled trapped timer register");
kvm_inject_undefined(vcpu);
@@ -2075,6 +2081,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
AMU_AMEVTYPER1_EL0(14),
AMU_AMEVTYPER1_EL0(15),
+ { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer },
+ { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer },
@@ -2525,10 +2533,12 @@ static const struct sys_reg_desc cp15_64_regs[] = {
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 },
{ CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr },
{ Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */
+ { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer },
{ Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 },
{ Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
{ Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */
{ SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer },
+ { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer },
};
static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index f3e46a976125..6ce5c025218d 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -206,6 +206,7 @@ TRACE_EVENT(kvm_get_timer_map,
__field( unsigned long, vcpu_id )
__field( int, direct_vtimer )
__field( int, direct_ptimer )
+ __field( int, emul_vtimer )
__field( int, emul_ptimer )
),
@@ -214,14 +215,17 @@ TRACE_EVENT(kvm_get_timer_map,
__entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer);
__entry->direct_ptimer =
(map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+ __entry->emul_vtimer =
+ (map->emul_vtimer) ? arch_timer_ctx_index(map->emul_vtimer) : -1;
__entry->emul_ptimer =
(map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
),
- TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+ TP_printk("VCPU: %ld, dv: %d, dp: %d, ev: %d, ep: %d",
__entry->vcpu_id,
__entry->direct_vtimer,
__entry->direct_ptimer,
+ __entry->emul_vtimer,
__entry->emul_ptimer)
);
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 78cde687383c..07aa0437125a 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -85,7 +85,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
struct kvm *kvm = s->private;
struct vgic_state_iter *iter;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
iter = kvm->arch.vgic.iter;
if (iter) {
iter = ERR_PTR(-EBUSY);
@@ -104,7 +104,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
if (end_of_vgic(iter))
iter = NULL;
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return iter;
}
@@ -132,12 +132,12 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
if (IS_ERR(v))
return;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
iter = kvm->arch.vgic.iter;
kfree(iter->lpi_array);
kfree(iter);
kvm->arch.vgic.iter = NULL;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
}
static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index cd134db41a57..9d42c7cb2b58 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -74,9 +74,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
unsigned long i;
int ret;
- if (irqchip_in_kernel(kvm))
- return -EEXIST;
-
/*
* This function is also called by the KVM_CREATE_IRQCHIP handler,
* which had no chance yet to check the availability of the GICv2
@@ -87,10 +84,20 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
!kvm_vgic_global_state.can_emulate_gicv2)
return -ENODEV;
+ /* Must be held to avoid race with vCPU creation */
+ lockdep_assert_held(&kvm->lock);
+
ret = -EBUSY;
if (!lock_all_vcpus(kvm))
return ret;
+ mutex_lock(&kvm->arch.config_lock);
+
+ if (irqchip_in_kernel(kvm)) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
+
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu_has_run_once(vcpu))
goto out_unlock;
@@ -118,6 +125,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
out_unlock:
+ mutex_unlock(&kvm->arch.config_lock);
unlock_all_vcpus(kvm);
return ret;
}
@@ -227,9 +235,9 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
* KVM io device for the redistributor that belongs to this VCPU.
*/
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
ret = vgic_register_redist_iodev(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
}
return ret;
}
@@ -250,7 +258,6 @@ static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
* The function is generally called when nr_spis has been explicitly set
* by the guest through the KVM DEVICE API. If not nr_spis is set to 256.
* vgic_initialized() returns true when this function has succeeded.
- * Must be called with kvm->lock held!
*/
int vgic_init(struct kvm *kvm)
{
@@ -259,6 +266,8 @@ int vgic_init(struct kvm *kvm)
int ret = 0, i;
unsigned long idx;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (vgic_initialized(kvm))
return 0;
@@ -373,12 +382,13 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
}
-/* To be called with kvm->lock held */
static void __kvm_vgic_destroy(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
vgic_debug_destroy(kvm);
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -389,9 +399,9 @@ static void __kvm_vgic_destroy(struct kvm *kvm)
void kvm_vgic_destroy(struct kvm *kvm)
{
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
__kvm_vgic_destroy(kvm);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
}
/**
@@ -414,9 +424,9 @@ int vgic_lazy_init(struct kvm *kvm)
if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
return -EBUSY;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
ret = vgic_init(kvm);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
}
return ret;
@@ -441,7 +451,7 @@ int kvm_vgic_map_resources(struct kvm *kvm)
if (likely(vgic_ready(kvm)))
return 0;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
if (vgic_ready(kvm))
goto out;
@@ -459,7 +469,7 @@ int kvm_vgic_map_resources(struct kvm *kvm)
dist->ready = true;
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 2642e9ce2819..750e51e3779a 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -1958,6 +1958,16 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
mutex_init(&its->its_lock);
mutex_init(&its->cmd_lock);
+ /* Yep, even more trickery for lock ordering... */
+#ifdef CONFIG_LOCKDEP
+ mutex_lock(&dev->kvm->arch.config_lock);
+ mutex_lock(&its->cmd_lock);
+ mutex_lock(&its->its_lock);
+ mutex_unlock(&its->its_lock);
+ mutex_unlock(&its->cmd_lock);
+ mutex_unlock(&dev->kvm->arch.config_lock);
+#endif
+
its->vgic_its_base = VGIC_ADDR_UNDEF;
INIT_LIST_HEAD(&its->device_list);
@@ -2045,6 +2055,13 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
+ if (!lock_all_vcpus(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return -EBUSY;
+ }
+
+ mutex_lock(&dev->kvm->arch.config_lock);
+
if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
ret = -ENXIO;
goto out;
@@ -2058,11 +2075,6 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
goto out;
}
- if (!lock_all_vcpus(dev->kvm)) {
- ret = -EBUSY;
- goto out;
- }
-
addr = its->vgic_its_base + offset;
len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
@@ -2076,8 +2088,9 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
} else {
*reg = region->its_read(dev->kvm, its, addr, len);
}
- unlock_all_vcpus(dev->kvm);
out:
+ mutex_unlock(&dev->kvm->arch.config_lock);
+ unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return ret;
}
@@ -2749,14 +2762,15 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
return 0;
mutex_lock(&kvm->lock);
- mutex_lock(&its->its_lock);
if (!lock_all_vcpus(kvm)) {
- mutex_unlock(&its->its_lock);
mutex_unlock(&kvm->lock);
return -EBUSY;
}
+ mutex_lock(&kvm->arch.config_lock);
+ mutex_lock(&its->its_lock);
+
switch (attr) {
case KVM_DEV_ARM_ITS_CTRL_RESET:
vgic_its_reset(kvm, its);
@@ -2769,8 +2783,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
break;
}
- unlock_all_vcpus(kvm);
mutex_unlock(&its->its_lock);
+ mutex_unlock(&kvm->arch.config_lock);
+ unlock_all_vcpus(kvm);
mutex_unlock(&kvm->lock);
return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index edeac2380591..35cfa268fd5d 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -46,7 +46,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev
struct vgic_dist *vgic = &kvm->arch.vgic;
int r;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) {
case KVM_VGIC_V2_ADDR_TYPE_DIST:
r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -68,7 +68,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev
r = -ENODEV;
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
return r;
}
@@ -102,7 +102,7 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri
if (get_user(addr, uaddr))
return -EFAULT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.config_lock);
switch (attr->attr) {
case KVM_VGIC_V2_ADDR_TYPE_DIST:
r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -191,7 +191,7 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri
}
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.config_lock);
if (!r && !write)
r = put_user(addr, uaddr);
@@ -227,7 +227,7 @@ static int vgic_set_common_attr(struct kvm_device *dev,
(val & 31))
return -EINVAL;
- mutex_lock(&dev->kvm->lock);
+ mutex_lock(&dev->kvm->arch.config_lock);
if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
ret = -EBUSY;
@@ -235,16 +235,16 @@ static int vgic_set_common_attr(struct kvm_device *dev,
dev->kvm->arch.vgic.nr_spis =
val - VGIC_NR_PRIVATE_IRQS;
- mutex_unlock(&dev->kvm->lock);
+ mutex_unlock(&dev->kvm->arch.config_lock);
return ret;
}
case KVM_DEV_ARM_VGIC_GRP_CTRL: {
switch (attr->attr) {
case KVM_DEV_ARM_VGIC_CTRL_INIT:
- mutex_lock(&dev->kvm->lock);
+ mutex_lock(&dev->kvm->arch.config_lock);
r = vgic_init(dev->kvm);
- mutex_unlock(&dev->kvm->lock);
+ mutex_unlock(&dev->kvm->arch.config_lock);
return r;
case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
/*
@@ -260,7 +260,10 @@ static int vgic_set_common_attr(struct kvm_device *dev,
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
+
+ mutex_lock(&dev->kvm->arch.config_lock);
r = vgic_v3_save_pending_tables(dev->kvm);
+ mutex_unlock(&dev->kvm->arch.config_lock);
unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return r;
@@ -342,44 +345,6 @@ int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
return 0;
}
-/* unlocks vcpus from @vcpu_lock_idx and smaller */
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
- struct kvm_vcpu *tmp_vcpu;
-
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
- mutex_unlock(&tmp_vcpu->mutex);
- }
-}
-
-void unlock_all_vcpus(struct kvm *kvm)
-{
- unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-/* Returns true if all vcpus were locked, false otherwise */
-bool lock_all_vcpus(struct kvm *kvm)
-{
- struct kvm_vcpu *tmp_vcpu;
- unsigned long c;
-
- /*
- * Any time a vcpu is run, vcpu_load is called which tries to grab the
- * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
- * that no other VCPUs are run and fiddle with the vgic state while we
- * access it.
- */
- kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
- if (!mutex_trylock(&tmp_vcpu->mutex)) {
- unlock_vcpus(kvm, c - 1);
- return false;
- }
- }
-
- return true;
-}
-
/**
* vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
*
@@ -411,15 +376,17 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
+ if (!lock_all_vcpus(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return -EBUSY;
+ }
+
+ mutex_lock(&dev->kvm->arch.config_lock);
+
ret = vgic_init(dev->kvm);
if (ret)
goto out;
- if (!lock_all_vcpus(dev->kvm)) {
- ret = -EBUSY;
- goto out;
- }
-
switch (attr->group) {
case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val);
@@ -432,8 +399,9 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
break;
}
- unlock_all_vcpus(dev->kvm);
out:
+ mutex_unlock(&dev->kvm->arch.config_lock);
+ unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && !is_write)
@@ -569,12 +537,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (unlikely(!vgic_initialized(dev->kvm))) {
- ret = -EBUSY;
- goto out;
+ if (!lock_all_vcpus(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return -EBUSY;
}
- if (!lock_all_vcpus(dev->kvm)) {
+ mutex_lock(&dev->kvm->arch.config_lock);
+
+ if (unlikely(!vgic_initialized(dev->kvm))) {
ret = -EBUSY;
goto out;
}
@@ -609,8 +579,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
break;
}
- unlock_all_vcpus(dev->kvm);
out:
+ mutex_unlock(&dev->kvm->arch.config_lock);
+ unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && uaccess && !is_write) {
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 91201f743033..472b18ac92a2 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -111,7 +111,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
case GICD_CTLR: {
bool was_enabled, is_hwsgi;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
was_enabled = dist->enabled;
is_hwsgi = dist->nassgireq;
@@ -139,7 +139,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
else if (!was_enabled && dist->enabled)
vgic_kick_vcpus(vcpu->kvm);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
break;
}
case GICD_TYPER:
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index e67b3b2c8044..1939c94e0b24 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -530,13 +530,13 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
u32 val;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
vgic_access_active_prepare(vcpu, intid);
val = __vgic_mmio_read_active(vcpu, addr, len);
vgic_access_active_finish(vcpu, intid);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
return val;
}
@@ -625,13 +625,13 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_cactive(vcpu, addr, len, val);
vgic_access_active_finish(vcpu, intid);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
}
int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
@@ -662,13 +662,13 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.config_lock);
vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_sactive(vcpu, addr, len, val);
vgic_access_active_finish(vcpu, intid);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
}
int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index a413718be92b..3bb003478060 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -232,9 +232,8 @@ int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq)
* @kvm: Pointer to the VM being initialized
*
* We may be called each time a vITS is created, or when the
- * vgic is initialized. This relies on kvm->lock to be
- * held. In both cases, the number of vcpus should now be
- * fixed.
+ * vgic is initialized. In both cases, the number of vcpus
+ * should now be fixed.
*/
int vgic_v4_init(struct kvm *kvm)
{
@@ -243,6 +242,8 @@ int vgic_v4_init(struct kvm *kvm)
int nr_vcpus, ret;
unsigned long i;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (!kvm_vgic_global_state.has_gicv4)
return 0; /* Nothing to see here... move along. */
@@ -309,14 +310,14 @@ int vgic_v4_init(struct kvm *kvm)
/**
* vgic_v4_teardown - Free the GICv4 data structures
* @kvm: Pointer to the VM being destroyed
- *
- * Relies on kvm->lock to be held.
*/
void vgic_v4_teardown(struct kvm *kvm)
{
struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
int i;
+ lockdep_assert_held(&kvm->arch.config_lock);
+
if (!its_vm->vpes)
return;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index d97e6080b421..8be4c1ebdec2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -24,11 +24,13 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
/*
* Locking order is always:
* kvm->lock (mutex)
- * its->cmd_lock (mutex)
- * its->its_lock (mutex)
- * vgic_cpu->ap_list_lock must be taken with IRQs disabled
- * kvm->lpi_list_lock must be taken with IRQs disabled
- * vgic_irq->irq_lock must be taken with IRQs disabled
+ * vcpu->mutex (mutex)
+ * kvm->arch.config_lock (mutex)
+ * its->cmd_lock (mutex)
+ * its->its_lock (mutex)
+ * vgic_cpu->ap_list_lock must be taken with IRQs disabled
+ * kvm->lpi_list_lock must be taken with IRQs disabled
+ * vgic_irq->irq_lock must be taken with IRQs disabled
*
* As the ap_list_lock might be taken from the timer interrupt handler,
* we have to disable IRQs before taking this lock and everything lower
@@ -573,6 +575,21 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
return 0;
}
+int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ unsigned long flags;
+ int ret = -1;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->hw)
+ ret = irq->hwintid;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ return ret;
+}
+
/**
* kvm_vgic_set_owner - Set the owner of an interrupt for a VM
*
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 7f7f3c5ed85a..f9923beedd27 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -273,9 +273,6 @@ int vgic_init(struct kvm *kvm);
void vgic_debug_init(struct kvm *kvm);
void vgic_debug_destroy(struct kvm *kvm);
-bool lock_all_vcpus(struct kvm *kvm);
-void unlock_all_vcpus(struct kvm *kvm);
-
static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 37b1340e9646..40ba95472594 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -23,6 +23,7 @@ HAS_DCPOP
HAS_DIT
HAS_E0PD
HAS_ECV
+HAS_ECV_CNTPOFF
HAS_EPAN
HAS_GENERIC_AUTH
HAS_GENERIC_AUTH_ARCH_QARMA3
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index dd5a9c7e310f..7063f1aacc54 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1952,6 +1952,10 @@ Sysreg CONTEXTIDR_EL2 3 4 13 0 1
Fields CONTEXTIDR_ELx
EndSysreg
+Sysreg CNTPOFF_EL2 3 4 14 0 6
+Field 63:0 PhysicalOffset
+EndSysreg
+
Sysreg CPACR_EL12 3 5 1 0 2
Fields CPACR_ELx
EndSysreg
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 2803c9c21ef9..957121a495f0 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -757,7 +757,7 @@ struct kvm_mips_callbacks {
int (*vcpu_run)(struct kvm_vcpu *vcpu);
void (*vcpu_reenter)(struct kvm_vcpu *vcpu);
};
-extern struct kvm_mips_callbacks *kvm_mips_callbacks;
+extern const struct kvm_mips_callbacks * const kvm_mips_callbacks;
int kvm_mips_emulation_init(void);
/* Debug: dump vcpu state */
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index dafab003ea0d..3d21cbfa7443 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -3305,7 +3305,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
};
/* FIXME: Get rid of the callbacks now that trap-and-emulate is gone. */
-struct kvm_mips_callbacks *kvm_mips_callbacks = &kvm_vz_callbacks;
+const struct kvm_mips_callbacks * const kvm_mips_callbacks = &kvm_vz_callbacks;
int kvm_mips_emulation_init(void)
{
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 92a968202ba7..365d2720097c 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -2,7 +2,7 @@
#ifndef __ASM_KASAN_H
#define __ASM_KASAN_H
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX)
#define _GLOBAL_KASAN(fn) _GLOBAL(__##fn)
#define _GLOBAL_TOC_KASAN(fn) _GLOBAL_TOC(__##fn)
#define EXPORT_SYMBOL_KASAN(fn) EXPORT_SYMBOL(__##fn)
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 2aa0e31e6884..60ba22770f51 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -30,11 +30,17 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
extern void * memchr(const void *,int,__kernel_size_t);
void memcpy_flushcache(void *dest, const void *src, size_t size);
+#ifdef CONFIG_KASAN
+/* __mem variants are used by KASAN to implement instrumented meminstrinsics. */
+#ifdef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
+#define __memset memset
+#define __memcpy memcpy
+#define __memmove memmove
+#else /* CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX */
void *__memset(void *s, int c, __kernel_size_t count);
void *__memcpy(void *to, const void *from, __kernel_size_t n);
void *__memmove(void *to, const void *from, __kernel_size_t n);
-
-#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+#ifndef __SANITIZE_ADDRESS__
/*
* For files that are not instrumented (e.g. mm/slub.c) we
* should use not instrumented version of mem* functions.
@@ -46,8 +52,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t n);
#ifndef __NO_FORTIFY
#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
#endif
-
-#endif
+#endif /* !__SANITIZE_ADDRESS__ */
+#endif /* CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX */
+#endif /* CONFIG_KASAN */
#ifdef CONFIG_PPC64
#ifndef CONFIG_KASAN
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index 5a319863f289..69623b9045d5 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -13,8 +13,13 @@
# If you really need to reference something from prom_init.o add
# it to the list below:
-grep "^CONFIG_KASAN=y$" ${KCONFIG_CONFIG} >/dev/null
-if [ $? -eq 0 ]
+has_renamed_memintrinsics()
+{
+ grep -q "^CONFIG_KASAN=y$" ${KCONFIG_CONFIG} && \
+ ! grep -q "^CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX=y" ${KCONFIG_CONFIG}
+}
+
+if has_renamed_memintrinsics
then
MEM_FUNCS="__memcpy __memset"
else
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 2bef19cc1b98..af46aa88422b 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -271,11 +271,16 @@ static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma
}
/*
- * Check for a read fault. This could be caused by a read on an
- * inaccessible page (i.e. PROT_NONE), or a Radix MMU execute-only page.
+ * VM_READ, VM_WRITE and VM_EXEC all imply read permissions, as
+ * defined in protection_map[]. Read faults can only be caused by
+ * a PROT_NONE mapping, or with a PROT_EXEC-only mapping on Radix.
*/
- if (unlikely(!(vma->vm_flags & VM_READ)))
+ if (unlikely(!vma_is_accessible(vma)))
return true;
+
+ if (unlikely(radix_enabled() && ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)))
+ return true;
+
/*
* We should ideally do the vma pkey access check here. But in the
* fault path, handle_mm_fault() also does the same check. To avoid
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index b481c5c8bae1..21b22bf16ce6 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -7,6 +7,7 @@ config PPC_PSERIES
select OF_DYNAMIC
select FORCE_PCI
select PCI_MSI
+ select GENERIC_ALLOCATOR
select PPC_XICS
select PPC_XIVE_SPAPR
select PPC_ICP_NATIVE
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index c5e42cc37604..5b182d1c196c 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -464,6 +464,28 @@ config TOOLCHAIN_HAS_ZIHINTPAUSE
depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zihintpause)
depends on LLD_VERSION >= 150000 || LD_VERSION >= 23600
+config TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI
+ def_bool y
+ # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=aed44286efa8ae8717a77d94b51ac3614e2ca6dc
+ depends on AS_IS_GNU && AS_VERSION >= 23800
+ help
+ Newer binutils versions default to ISA spec version 20191213 which
+ moves some instructions from the I extension to the Zicsr and Zifencei
+ extensions.
+
+config TOOLCHAIN_NEEDS_OLD_ISA_SPEC
+ def_bool y
+ depends on TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI
+ # https://github.com/llvm/llvm-project/commit/22e199e6afb1263c943c0c0d4498694e15bf8a16
+ depends on CC_IS_CLANG && CLANG_VERSION < 170000
+ help
+ Certain versions of clang do not support zicsr and zifencei via -march
+ but newer versions of binutils require it for the reasons noted in the
+ help text of CONFIG_TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI. This
+ option causes an older ISA spec compatible with these older versions
+ of clang to be passed to GAS, which has the same result as passing zicsr
+ and zifencei to -march.
+
config FPU
bool "FPU support"
default y
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 4de83b9b1772..b05e833a022d 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -57,10 +57,12 @@ riscv-march-$(CONFIG_ARCH_RV64I) := rv64ima
riscv-march-$(CONFIG_FPU) := $(riscv-march-y)fd
riscv-march-$(CONFIG_RISCV_ISA_C) := $(riscv-march-y)c
-# Newer binutils versions default to ISA spec version 20191213 which moves some
-# instructions from the I extension to the Zicsr and Zifencei extensions.
-toolchain-need-zicsr-zifencei := $(call cc-option-yn, -march=$(riscv-march-y)_zicsr_zifencei)
-riscv-march-$(toolchain-need-zicsr-zifencei) := $(riscv-march-y)_zicsr_zifencei
+ifdef CONFIG_TOOLCHAIN_NEEDS_OLD_ISA_SPEC
+KBUILD_CFLAGS += -Wa,-misa-spec=2.2
+KBUILD_AFLAGS += -Wa,-misa-spec=2.2
+else
+riscv-march-$(CONFIG_TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI) := $(riscv-march-y)_zicsr_zifencei
+endif
# Check if the toolchain supports Zihintpause extension
riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE) := $(riscv-march-y)_zihintpause
diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index 5ff1f19fd45c..0099dc116168 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -19,8 +19,6 @@ typedef struct {
#ifdef CONFIG_SMP
/* A local icache flush is needed before user execution can resume. */
cpumask_t icache_stale_mask;
- /* A local tlb flush is needed before user execution can resume. */
- cpumask_t tlb_stale_mask;
#endif
} mm_context_t;
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 907b9efd39a8..a09196f8de68 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -12,6 +12,8 @@
#include <asm/errata_list.h>
#ifdef CONFIG_MMU
+extern unsigned long asid_mask;
+
static inline void local_flush_tlb_all(void)
{
__asm__ __volatile__ ("sfence.vma" : : : "memory");
@@ -22,24 +24,6 @@ static inline void local_flush_tlb_page(unsigned long addr)
{
ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"));
}
-
-static inline void local_flush_tlb_all_asid(unsigned long asid)
-{
- __asm__ __volatile__ ("sfence.vma x0, %0"
- :
- : "r" (asid)
- : "memory");
-}
-
-static inline void local_flush_tlb_page_asid(unsigned long addr,
- unsigned long asid)
-{
- __asm__ __volatile__ ("sfence.vma %0, %1"
- :
- : "r" (addr), "r" (asid)
- : "memory");
-}
-
#else /* CONFIG_MMU */
#define local_flush_tlb_all() do { } while (0)
#define local_flush_tlb_page(addr) do { } while (0)
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 80ce9caba8d2..12e22e7330e7 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -22,7 +22,7 @@ DEFINE_STATIC_KEY_FALSE(use_asid_allocator);
static unsigned long asid_bits;
static unsigned long num_asids;
-static unsigned long asid_mask;
+unsigned long asid_mask;
static atomic_long_t current_version;
@@ -196,16 +196,6 @@ switch_mm_fast:
if (need_flush_tlb)
local_flush_tlb_all();
-#ifdef CONFIG_SMP
- else {
- cpumask_t *mask = &mm->context.tlb_stale_mask;
-
- if (cpumask_test_cpu(cpu, mask)) {
- cpumask_clear_cpu(cpu, mask);
- local_flush_tlb_all_asid(cntx & asid_mask);
- }
- }
-#endif
}
static void set_mm_noasid(struct mm_struct *mm)
@@ -215,12 +205,24 @@ static void set_mm_noasid(struct mm_struct *mm)
local_flush_tlb_all();
}
-static inline void set_mm(struct mm_struct *mm, unsigned int cpu)
+static inline void set_mm(struct mm_struct *prev,
+ struct mm_struct *next, unsigned int cpu)
{
- if (static_branch_unlikely(&use_asid_allocator))
- set_mm_asid(mm, cpu);
- else
- set_mm_noasid(mm);
+ /*
+ * The mm_cpumask indicates which harts' TLBs contain the virtual
+ * address mapping of the mm. Compared to noasid, using asid
+ * can't guarantee that stale TLB entries are invalidated because
+ * the asid mechanism wouldn't flush TLB for every switch_mm for
+ * performance. So when using asid, keep all CPUs footmarks in
+ * cpumask() until mm reset.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (static_branch_unlikely(&use_asid_allocator)) {
+ set_mm_asid(next, cpu);
+ } else {
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+ set_mm_noasid(next);
+ }
}
static int __init asids_init(void)
@@ -274,7 +276,8 @@ static int __init asids_init(void)
}
early_initcall(asids_init);
#else
-static inline void set_mm(struct mm_struct *mm, unsigned int cpu)
+static inline void set_mm(struct mm_struct *prev,
+ struct mm_struct *next, unsigned int cpu)
{
/* Nothing to do here when there is no MMU */
}
@@ -327,10 +330,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
cpu = smp_processor_id();
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- set_mm(next, cpu);
+ set_mm(prev, next, cpu);
flush_icache_deferred(next, cpu);
}
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 460f785f6e09..d5f3e501dffb 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -143,6 +143,8 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a
no_context(regs, addr);
return;
}
+ if (pud_leaf(*pud_k))
+ goto flush_tlb;
/*
* Since the vmalloc area is global, it is unnecessary
@@ -153,6 +155,8 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a
no_context(regs, addr);
return;
}
+ if (pmd_leaf(*pmd_k))
+ goto flush_tlb;
/*
* Make sure the actual PTE exists as well to
@@ -172,6 +176,7 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a
* ordering constraint, not a cache flush; it is
* necessary even after writing invalid entries.
*/
+flush_tlb:
local_flush_tlb_page(addr);
}
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index ce7dfc81bb3f..ef701fa83f36 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -5,7 +5,23 @@
#include <linux/sched.h>
#include <asm/sbi.h>
#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
+
+static inline void local_flush_tlb_all_asid(unsigned long asid)
+{
+ __asm__ __volatile__ ("sfence.vma x0, %0"
+ :
+ : "r" (asid)
+ : "memory");
+}
+
+static inline void local_flush_tlb_page_asid(unsigned long addr,
+ unsigned long asid)
+{
+ __asm__ __volatile__ ("sfence.vma %0, %1"
+ :
+ : "r" (addr), "r" (asid)
+ : "memory");
+}
void flush_tlb_all(void)
{
@@ -15,7 +31,6 @@ void flush_tlb_all(void)
static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
unsigned long size, unsigned long stride)
{
- struct cpumask *pmask = &mm->context.tlb_stale_mask;
struct cpumask *cmask = mm_cpumask(mm);
unsigned int cpuid;
bool broadcast;
@@ -27,16 +42,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
/* check if the tlbflush needs to be sent to other CPUs */
broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids;
if (static_branch_unlikely(&use_asid_allocator)) {
- unsigned long asid = atomic_long_read(&mm->context.id);
-
- /*
- * TLB will be immediately flushed on harts concurrently
- * executing this MM context. TLB flush on other harts
- * is deferred until this MM context migrates there.
- */
- cpumask_setall(pmask);
- cpumask_clear_cpu(cpuid, pmask);
- cpumask_andnot(pmask, pmask, cmask);
+ unsigned long asid = atomic_long_read(&mm->context.id) & asid_mask;
if (broadcast) {
sbi_remote_sfence_vma_asid(cmask, start, size, asid);
diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c
index 9b14045065b6..74b5cd264862 100644
--- a/arch/s390/boot/ipl_report.c
+++ b/arch/s390/boot/ipl_report.c
@@ -57,11 +57,19 @@ repeat:
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size &&
intersects(initrd_data.start, initrd_data.size, safe_addr, size))
safe_addr = initrd_data.start + initrd_data.size;
+ if (intersects(safe_addr, size, (unsigned long)comps, comps->len)) {
+ safe_addr = (unsigned long)comps + comps->len;
+ goto repeat;
+ }
for_each_rb_entry(comp, comps)
if (intersects(safe_addr, size, comp->addr, comp->len)) {
safe_addr = comp->addr + comp->len;
goto repeat;
}
+ if (intersects(safe_addr, size, (unsigned long)certs, certs->len)) {
+ safe_addr = (unsigned long)certs + certs->len;
+ goto repeat;
+ }
for_each_rb_entry(cert, certs)
if (intersects(safe_addr, size, cert->addr, cert->len)) {
safe_addr = cert->addr + cert->len;
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 3c68fe49042c..4ccf66d29fc2 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -23,7 +23,6 @@ CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
-CONFIG_RT_GROUP_SCHED=y
CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
@@ -90,7 +89,6 @@ CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
-CONFIG_BFQ_GROUP_IOSCHED=y
CONFIG_BINFMT_MISC=m
CONFIG_ZSWAP=y
CONFIG_ZSMALLOC_STAT=y
@@ -298,7 +296,6 @@ CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -340,7 +337,6 @@ CONFIG_BRIDGE_MRP=y
CONFIG_VLAN_8021Q=m
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_CBQ=m
CONFIG_NET_SCH_HTB=m
CONFIG_NET_SCH_HFSC=m
CONFIG_NET_SCH_PRIO=m
@@ -351,7 +347,6 @@ CONFIG_NET_SCH_SFQ=m
CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_GRED=m
-CONFIG_NET_SCH_DSMARK=m
CONFIG_NET_SCH_NETEM=m
CONFIG_NET_SCH_DRR=m
CONFIG_NET_SCH_MQPRIO=m
@@ -363,14 +358,11 @@ CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
CONFIG_NET_SCH_ETS=m
CONFIG_NET_CLS_BASIC=m
-CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
-CONFIG_NET_CLS_RSVP=m
-CONFIG_NET_CLS_RSVP6=m
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
@@ -584,7 +576,7 @@ CONFIG_DIAG288_WATCHDOG=m
CONFIG_FB=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
-# CONFIG_HID is not set
+# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_ACCESS=m
@@ -828,6 +820,7 @@ CONFIG_PANIC_ON_OOPS=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_WQ_WATCHDOG=y
CONFIG_TEST_LOCKUP=m
+CONFIG_DEBUG_PREEMPT=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
@@ -843,6 +836,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=300
# CONFIG_RCU_TRACE is not set
CONFIG_LATENCYTOP=y
CONFIG_BOOTTIME_TRACING=y
+CONFIG_FPROBE=y
CONFIG_FUNCTION_PROFILER=y
CONFIG_STACK_TRACER=y
CONFIG_IRQSOFF_TRACER=y
@@ -857,6 +851,7 @@ CONFIG_SAMPLES=y
CONFIG_SAMPLE_TRACE_PRINTK=m
CONFIG_SAMPLE_FTRACE_DIRECT=m
CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
+CONFIG_SAMPLE_FTRACE_OPS=m
CONFIG_DEBUG_ENTRY=y
CONFIG_CIO_INJECT=y
CONFIG_KUNIT=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 9ab91632f74c..693297a2e897 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -21,7 +21,6 @@ CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
-CONFIG_RT_GROUP_SCHED=y
CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
@@ -85,7 +84,6 @@ CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
-CONFIG_BFQ_GROUP_IOSCHED=y
CONFIG_BINFMT_MISC=m
CONFIG_ZSWAP=y
CONFIG_ZSMALLOC_STAT=y
@@ -289,7 +287,6 @@ CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -330,7 +327,6 @@ CONFIG_BRIDGE_MRP=y
CONFIG_VLAN_8021Q=m
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_CBQ=m
CONFIG_NET_SCH_HTB=m
CONFIG_NET_SCH_HFSC=m
CONFIG_NET_SCH_PRIO=m
@@ -341,7 +337,6 @@ CONFIG_NET_SCH_SFQ=m
CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_GRED=m
-CONFIG_NET_SCH_DSMARK=m
CONFIG_NET_SCH_NETEM=m
CONFIG_NET_SCH_DRR=m
CONFIG_NET_SCH_MQPRIO=m
@@ -353,14 +348,11 @@ CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
CONFIG_NET_SCH_ETS=m
CONFIG_NET_CLS_BASIC=m
-CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
-CONFIG_NET_CLS_RSVP=m
-CONFIG_NET_CLS_RSVP6=m
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
@@ -573,7 +565,7 @@ CONFIG_DIAG288_WATCHDOG=m
CONFIG_FB=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
-# CONFIG_HID is not set
+# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_ACCESS=m
@@ -795,6 +787,7 @@ CONFIG_RCU_REF_SCALE_TEST=m
CONFIG_RCU_CPU_STALL_TIMEOUT=60
CONFIG_LATENCYTOP=y
CONFIG_BOOTTIME_TRACING=y
+CONFIG_FPROBE=y
CONFIG_FUNCTION_PROFILER=y
CONFIG_STACK_TRACER=y
CONFIG_SCHED_TRACER=y
@@ -805,6 +798,7 @@ CONFIG_SAMPLES=y
CONFIG_SAMPLE_TRACE_PRINTK=m
CONFIG_SAMPLE_FTRACE_DIRECT=m
CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
+CONFIG_SAMPLE_FTRACE_OPS=m
CONFIG_KUNIT=m
CONFIG_KUNIT_DEBUGFS=y
CONFIG_LKDTM=m
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index a9c0c81d1de9..33a232bb68af 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -58,7 +58,7 @@ CONFIG_ZFCP=y
# CONFIG_VMCP is not set
# CONFIG_MONWRITER is not set
# CONFIG_S390_VMUR is not set
-# CONFIG_HID is not set
+# CONFIG_HID_SUPPORT is not set
# CONFIG_VIRTIO_MENU is not set
# CONFIG_VHOST_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 9250fde1f97d..da6dac36e959 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -305,7 +305,7 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi)
static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa)
{
- return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa;
+ return READ_ONCE(gisa->next_alert) != (u32)virt_to_phys(gisa);
}
static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
@@ -3168,7 +3168,7 @@ void kvm_s390_gisa_init(struct kvm *kvm)
hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
gi->timer.function = gisa_vcpu_kicker;
memset(gi->origin, 0, sizeof(struct kvm_s390_gisa));
- gi->origin->next_alert = (u32)(u64)gi->origin;
+ gi->origin->next_alert = (u32)virt_to_phys(gi->origin);
VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
}
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index b124d586db55..7dab00f1e833 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -112,7 +112,7 @@ static int zpci_reset_aipb(u8 nisc)
return -EINVAL;
aift->sbv = zpci_aif_sbv;
- aift->gait = (struct zpci_gaite *)zpci_aipb->aipb.gait;
+ aift->gait = phys_to_virt(zpci_aipb->aipb.gait);
return 0;
}
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b6a0219e470a..8d6b765abf29 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -138,11 +138,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/* Copy to APCB FORMAT1 from APCB FORMAT0 */
static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
- unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h)
+ unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h)
{
struct kvm_s390_apcb0 tmp;
+ unsigned long apcb_gpa;
- if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0)))
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0);
+
+ if (read_guest_real(vcpu, apcb_gpa, &tmp,
+ sizeof(struct kvm_s390_apcb0)))
return -EFAULT;
apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0];
@@ -157,15 +161,19 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
* setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0
* @vcpu: pointer to the virtual CPU
* @apcb_s: pointer to start of apcb in the shadow crycb
- * @apcb_o: pointer to start of original apcb in the guest2
+ * @crycb_gpa: guest physical address to start of original guest crycb
* @apcb_h: pointer to start of apcb in the guest1
*
* Returns 0 and -EFAULT on error reading guest apcb
*/
static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
- unsigned long apcb_o, unsigned long *apcb_h)
+ unsigned long crycb_gpa, unsigned long *apcb_h)
{
- if (read_guest_real(vcpu, apcb_o, apcb_s,
+ unsigned long apcb_gpa;
+
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0);
+
+ if (read_guest_real(vcpu, apcb_gpa, apcb_s,
sizeof(struct kvm_s390_apcb0)))
return -EFAULT;
@@ -178,16 +186,20 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB
* @vcpu: pointer to the virtual CPU
* @apcb_s: pointer to start of apcb in the shadow crycb
- * @apcb_o: pointer to start of original guest apcb
+ * @crycb_gpa: guest physical address to start of original guest crycb
* @apcb_h: pointer to start of apcb in the host
*
* Returns 0 and -EFAULT on error reading guest apcb
*/
static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
- unsigned long apcb_o,
+ unsigned long crycb_gpa,
unsigned long *apcb_h)
{
- if (read_guest_real(vcpu, apcb_o, apcb_s,
+ unsigned long apcb_gpa;
+
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1);
+
+ if (read_guest_real(vcpu, apcb_gpa, apcb_s,
sizeof(struct kvm_s390_apcb1)))
return -EFAULT;
@@ -200,7 +212,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* setup_apcb - Create a shadow copy of the apcb.
* @vcpu: pointer to the virtual CPU
* @crycb_s: pointer to shadow crycb
- * @crycb_o: pointer to original guest crycb
+ * @crycb_gpa: guest physical address of original guest crycb
* @crycb_h: pointer to the host crycb
* @fmt_o: format of the original guest crycb.
* @fmt_h: format of the host crycb.
@@ -211,50 +223,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* Return 0 or an error number if the guest and host crycb are incompatible.
*/
static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s,
- const u32 crycb_o,
+ const u32 crycb_gpa,
struct kvm_s390_crypto_cb *crycb_h,
int fmt_o, int fmt_h)
{
- struct kvm_s390_crypto_cb *crycb;
-
- crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o;
-
switch (fmt_o) {
case CRYCB_FORMAT2:
- if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK))
+ if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK))
return -EACCES;
if (fmt_h != CRYCB_FORMAT2)
return -EINVAL;
return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1,
- (unsigned long) &crycb->apcb1,
+ crycb_gpa,
(unsigned long *)&crycb_h->apcb1);
case CRYCB_FORMAT1:
switch (fmt_h) {
case CRYCB_FORMAT2:
return setup_apcb10(vcpu, &crycb_s->apcb1,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
&crycb_h->apcb1);
case CRYCB_FORMAT1:
return setup_apcb00(vcpu,
(unsigned long *) &crycb_s->apcb0,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
(unsigned long *) &crycb_h->apcb0);
}
break;
case CRYCB_FORMAT0:
- if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK))
+ if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK))
return -EACCES;
switch (fmt_h) {
case CRYCB_FORMAT2:
return setup_apcb10(vcpu, &crycb_s->apcb1,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
&crycb_h->apcb1);
case CRYCB_FORMAT1:
case CRYCB_FORMAT0:
return setup_apcb00(vcpu,
(unsigned long *) &crycb_s->apcb0,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
(unsigned long *) &crycb_h->apcb0);
}
}
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index ef38b1514c77..e16afacc8fd1 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -544,8 +544,7 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
return r;
}
-int zpci_setup_bus_resources(struct zpci_dev *zdev,
- struct list_head *resources)
+int zpci_setup_bus_resources(struct zpci_dev *zdev)
{
unsigned long addr, size, flags;
struct resource *res;
@@ -581,7 +580,6 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
return -ENOMEM;
}
zdev->bars[i].res = res;
- pci_add_resource(resources, res);
}
zdev->has_resources = 1;
@@ -590,17 +588,23 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev,
static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
{
+ struct resource *res;
int i;
+ pci_lock_rescan_remove();
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
- if (!zdev->bars[i].size || !zdev->bars[i].res)
+ res = zdev->bars[i].res;
+ if (!res)
continue;
+ release_resource(res);
+ pci_bus_remove_resource(zdev->zbus->bus, res);
zpci_free_iomap(zdev, zdev->bars[i].map_idx);
- release_resource(zdev->bars[i].res);
- kfree(zdev->bars[i].res);
+ zdev->bars[i].res = NULL;
+ kfree(res);
}
zdev->has_resources = 0;
+ pci_unlock_rescan_remove();
}
int pcibios_device_add(struct pci_dev *pdev)
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index 6a8da1b742ae..a99926af2b69 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -41,9 +41,7 @@ static int zpci_nb_devices;
*/
static int zpci_bus_prepare_device(struct zpci_dev *zdev)
{
- struct resource_entry *window, *n;
- struct resource *res;
- int rc;
+ int rc, i;
if (!zdev_enabled(zdev)) {
rc = zpci_enable_device(zdev);
@@ -57,10 +55,10 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev)
}
if (!zdev->has_resources) {
- zpci_setup_bus_resources(zdev, &zdev->zbus->resources);
- resource_list_for_each_entry_safe(window, n, &zdev->zbus->resources) {
- res = window->res;
- pci_bus_add_resource(zdev->zbus->bus, res, 0);
+ zpci_setup_bus_resources(zdev);
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ if (zdev->bars[i].res)
+ pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0);
}
}
diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
index e96c9860e064..af9f0ac79a1b 100644
--- a/arch/s390/pci/pci_bus.h
+++ b/arch/s390/pci/pci_bus.h
@@ -30,8 +30,7 @@ static inline void zpci_zdev_get(struct zpci_dev *zdev)
int zpci_alloc_domain(int domain);
void zpci_free_domain(int domain);
-int zpci_setup_bus_resources(struct zpci_dev *zdev,
- struct list_head *resources);
+int zpci_setup_bus_resources(struct zpci_dev *zdev);
static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus,
unsigned int devfn)
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 8c45b198b62f..bccea57dee81 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -923,6 +923,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
/* Event overflow */
handled++;
+ status &= ~mask;
perf_sample_data_init(&data, 0, hwc->last_period);
if (!x86_perf_event_set_period(event))
@@ -933,8 +934,6 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
-
- status &= ~mask;
}
/*
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 73c9672c123b..97327a1e3aff 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -226,10 +226,9 @@
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
@@ -337,6 +336,7 @@
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */
#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
@@ -369,6 +369,7 @@
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 8dc345cc6318..13bc212cd4bc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -54,8 +54,8 @@ KVM_X86_OP(set_rflags)
KVM_X86_OP(get_if_flag)
KVM_X86_OP(flush_tlb_all)
KVM_X86_OP(flush_tlb_current)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range)
KVM_X86_OP(flush_tlb_gva)
KVM_X86_OP(flush_tlb_guest)
KVM_X86_OP(vcpu_pre_run)
@@ -68,6 +68,8 @@ KVM_X86_OP(get_interrupt_shadow)
KVM_X86_OP(patch_hypercall)
KVM_X86_OP(inject_irq)
KVM_X86_OP(inject_nmi)
+KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending)
+KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending)
KVM_X86_OP(inject_exception)
KVM_X86_OP(cancel_injection)
KVM_X86_OP(interrupt_allowed)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a45de1118a42..fb9d1f2d6136 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -420,6 +420,10 @@ struct kvm_mmu_root_info {
#define KVM_MMU_NUM_PREV_ROOTS 3
+#define KVM_MMU_ROOT_CURRENT BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
+#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1)
+
#define KVM_HAVE_MMU_RWLOCK
struct kvm_mmu_page;
@@ -439,9 +443,8 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t gva_or_gpa, u64 access,
struct x86_exception *exception);
- int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
- void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
+ int (*sync_spte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, int i);
struct kvm_mmu_root_info root;
union kvm_cpu_role cpu_role;
union kvm_mmu_page_role root_role;
@@ -479,11 +482,6 @@ struct kvm_mmu {
u64 pdptrs[4]; /* pae */
};
-struct kvm_tlb_range {
- u64 start_gfn;
- u64 pages;
-};
-
enum pmc_type {
KVM_PMC_GP = 0,
KVM_PMC_FIXED,
@@ -515,6 +513,7 @@ struct kvm_pmc {
#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6
struct kvm_pmu {
+ u8 version;
unsigned nr_arch_gp_counters;
unsigned nr_arch_fixed_counters;
unsigned available_event_types;
@@ -527,7 +526,6 @@ struct kvm_pmu {
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
u64 raw_event_mask;
- u8 version;
struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
struct irq_work irq_work;
@@ -876,7 +874,8 @@ struct kvm_vcpu_arch {
u64 tsc_scaling_ratio; /* current scaling ratio */
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
- unsigned nmi_pending; /* NMI queued after currently running handler */
+ /* Number of NMIs pending injection, not including hardware vNMIs. */
+ unsigned int nmi_pending;
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
u8 handling_intr_from_guest;
@@ -1585,9 +1584,9 @@ struct kvm_x86_ops {
void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
- int (*tlb_remote_flush)(struct kvm *kvm);
- int (*tlb_remote_flush_with_range)(struct kvm *kvm,
- struct kvm_tlb_range *range);
+ int (*flush_remote_tlbs)(struct kvm *kvm);
+ int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn,
+ gfn_t nr_pages);
/*
* Flush any TLB entries associated with the given GVA.
@@ -1621,6 +1620,13 @@ struct kvm_x86_ops {
int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+ /* Whether or not a virtual NMI is pending in hardware. */
+ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
+ /*
+ * Attempt to pend a virtual NMI in harware. Returns %true on success
+ * to allow using static_call_ret0 as the fallback.
+ */
+ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -1791,8 +1797,8 @@ void kvm_arch_free_vm(struct kvm *kvm);
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
{
- if (kvm_x86_ops.tlb_remote_flush &&
- !static_call(kvm_x86_tlb_remote_flush)(kvm))
+ if (kvm_x86_ops.flush_remote_tlbs &&
+ !static_call(kvm_x86_flush_remote_tlbs)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1997,14 +2003,11 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
-#define KVM_MMU_ROOT_CURRENT BIT(0)
-#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
-#define KVM_MMU_ROOTS_ALL (~0UL)
-
int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
@@ -2044,8 +2047,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t gva, hpa_t root_hpa);
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
@@ -2207,4 +2210,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+/*
+ * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+ * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
+ * remaining 31 lower bits must be 0 to preserve ABI.
+ */
+#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b8357d6ecd47..b63be696b776 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -128,8 +128,9 @@ struct snp_psc_desc {
struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY];
} __packed;
-/* Guest message request error code */
+/* Guest message request error codes */
#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32)
+#define SNP_GUEST_REQ_ERR_BUSY BIT_ULL(33)
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 770dcf75eaa9..e7c7379d6ac7 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -183,6 +183,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_GIF_SHIFT 9
#define V_GIF_MASK (1 << V_GIF_SHIFT)
+#define V_NMI_PENDING_SHIFT 11
+#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT)
+
+#define V_NMI_BLOCKING_SHIFT 12
+#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT)
+
#define V_INTR_PRIO_SHIFT 16
#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
@@ -197,6 +203,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_GIF_ENABLE_SHIFT 25
#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+#define V_NMI_ENABLE_SHIFT 26
+#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT)
+
#define AVIC_ENABLE_SHIFT 31
#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
@@ -278,7 +287,6 @@ static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
struct vmcb_seg {
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
index 6daa9b0c8d11..a3c29b1496c8 100644
--- a/arch/x86/include/asm/xen/cpuid.h
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -89,11 +89,21 @@
* Sub-leaf 2: EAX: host tsc frequency in kHz
*/
+#define XEN_CPUID_TSC_EMULATED (1u << 0)
+#define XEN_CPUID_HOST_TSC_RELIABLE (1u << 1)
+#define XEN_CPUID_RDTSCP_INSTR_AVAIL (1u << 2)
+
+#define XEN_CPUID_TSC_MODE_DEFAULT (0)
+#define XEN_CPUID_TSC_MODE_ALWAYS_EMULATE (1u)
+#define XEN_CPUID_TSC_MODE_NEVER_EMULATE (2u)
+#define XEN_CPUID_TSC_MODE_PVRDTSCP (3u)
+
/*
* Leaf 5 (0x40000x04)
* HVM-specific features
* Sub-leaf 0: EAX: Features
* Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ * Sub-leaf 0: ECX: domain id (iff EAX has XEN_HVM_CPUID_DOMID_PRESENT flag)
*/
#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */
@@ -102,12 +112,16 @@
#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
#define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */
/*
- * Bits 55:49 from the IO-APIC RTE and bits 11:5 from the MSI address can be
- * used to store high bits for the Destination ID. This expands the Destination
- * ID field from 8 to 15 bits, allowing to target APIC IDs up 32768.
+ * With interrupt format set to 0 (non-remappable) bits 55:49 from the
+ * IO-APIC RTE and bits 11:5 from the MSI address can be used to store
+ * high bits for the Destination ID. This expands the Destination ID
+ * field from 8 to 15 bits, allowing to target APIC IDs up 32768.
*/
#define XEN_HVM_CPUID_EXT_DEST_ID (1u << 5)
-/* Per-vCPU event channel upcalls */
+/*
+ * Per-vCPU event channel upcalls work correctly with physical IRQs
+ * bound to event channels.
+ */
#define XEN_HVM_CPUID_UPCALL_VECTOR (1u << 6)
/*
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 7f467fe05d42..1a6a1f987949 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -559,4 +559,7 @@ struct kvm_pmu_event_filter {
#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+/* x86-specific KVM_EXIT_HYPERCALL flags. */
+#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 7832a69d170e..2eec60f50057 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2355,6 +2355,7 @@ static void mce_restart(void)
{
mce_timer_delete_all();
on_each_cpu(mce_cpu_restart, NULL, 1);
+ mce_schedule_work();
}
/* Toggle features for corrected errors */
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index eb07d4435391..b44c487727d4 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -368,7 +368,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
{
struct resctrl_schema *s;
struct rdtgroup *rdtgrp;
- struct rdt_domain *dom;
struct rdt_resource *r;
char *tok, *resname;
int ret = 0;
@@ -397,10 +396,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
goto out;
}
- list_for_each_entry(s, &resctrl_schema_all, list) {
- list_for_each_entry(dom, &s->res->domains, list)
- memset(dom->staged_config, 0, sizeof(dom->staged_config));
- }
+ rdt_staged_configs_clear();
while ((tok = strsep(&buf, "\n")) != NULL) {
resname = strim(strsep(&tok, ":"));
@@ -445,6 +441,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
}
out:
+ rdt_staged_configs_clear();
rdtgroup_kn_unlock(of->kn);
cpus_read_unlock();
return ret ?: nbytes;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 8edecc5763d8..85ceaf9a31ac 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -555,5 +555,6 @@ void __check_limbo(struct rdt_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
void __init mbm_config_rftype_init(const char *config);
+void rdt_staged_configs_clear(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 884b6e9a7e31..6ad33f355861 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -78,6 +78,19 @@ void rdt_last_cmd_printf(const char *fmt, ...)
va_end(ap);
}
+void rdt_staged_configs_clear(void)
+{
+ struct rdt_resource *r;
+ struct rdt_domain *dom;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for_each_alloc_capable_rdt_resource(r) {
+ list_for_each_entry(dom, &r->domains, list)
+ memset(dom->staged_config, 0, sizeof(dom->staged_config));
+ }
+}
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -3107,7 +3120,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
{
struct resctrl_schema *s;
struct rdt_resource *r;
- int ret;
+ int ret = 0;
+
+ rdt_staged_configs_clear();
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
@@ -3119,20 +3134,22 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
} else {
ret = rdtgroup_init_cat(s, rdtgrp->closid);
if (ret < 0)
- return ret;
+ goto out;
}
ret = resctrl_arch_update_domains(r, rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("Failed to initialize allocations\n");
- return ret;
+ goto out;
}
}
rdtgrp->mode = RDT_MODE_SHAREABLE;
- return 0;
+out:
+ rdt_staged_configs_clear();
+ return ret;
}
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 714166cc25f2..0bab497c9436 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1118,21 +1118,20 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
zerofrom = offsetof(struct xregs_state, extended_state_area);
/*
- * The ptrace buffer is in non-compacted XSAVE format. In
- * non-compacted format disabled features still occupy state space,
- * but there is no state to copy from in the compacted
- * init_fpstate. The gap tracking will zero these states.
- */
- mask = fpstate->user_xfeatures;
-
- /*
- * Dynamic features are not present in init_fpstate. When they are
- * in an all zeros init state, remove those from 'mask' to zero
- * those features in the user buffer instead of retrieving them
- * from init_fpstate.
+ * This 'mask' indicates which states to copy from fpstate.
+ * Those extended states that are not present in fpstate are
+ * either disabled or initialized:
+ *
+ * In non-compacted format, disabled features still occupy
+ * state space but there is no state to copy from in the
+ * compacted init_fpstate. The gap tracking will zero these
+ * states.
+ *
+ * The extended features have an all zeroes init state. Thus,
+ * remove them from 'mask' to zero those features in the user
+ * buffer instead of retrieving them from init_fpstate.
*/
- if (fpu_state_size_dynamic())
- mask &= (header.xfeatures | xinit->header.xcomp_bv);
+ mask = header.xfeatures;
for_each_extended_xfeature(i, mask) {
/*
@@ -1151,9 +1150,8 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
pkru.pkru = pkru_val;
membuf_write(&to, &pkru, sizeof(pkru));
} else {
- copy_feature(header.xfeatures & BIT_ULL(i), &to,
+ membuf_write(&to,
__raw_xsave_addr(xsave, i),
- __raw_xsave_addr(xinit, i),
xstate_sizes[i]);
}
/*
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 1265ad519249..fb4f1e01b64a 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -136,10 +136,12 @@ SYM_TYPED_FUNC_START(ftrace_stub)
RET
SYM_FUNC_END(ftrace_stub)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
SYM_TYPED_FUNC_START(ftrace_stub_graph)
CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(ftrace_stub_graph)
+#endif
#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 679026a640ef..3f664ab277c4 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -2183,9 +2183,6 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
struct ghcb *ghcb;
int ret;
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return -ENODEV;
-
if (!fw_err)
return -EINVAL;
@@ -2212,15 +2209,26 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
if (ret)
goto e_put;
- if (ghcb->save.sw_exit_info_2) {
- /* Number of expected pages are returned in RBX */
- if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST &&
- ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN)
- input->data_npages = ghcb_get_rbx(ghcb);
+ *fw_err = ghcb->save.sw_exit_info_2;
+ switch (*fw_err) {
+ case 0:
+ break;
- *fw_err = ghcb->save.sw_exit_info_2;
+ case SNP_GUEST_REQ_ERR_BUSY:
+ ret = -EAGAIN;
+ break;
+ case SNP_GUEST_REQ_INVALID_LEN:
+ /* Number of expected pages are returned in RBX */
+ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
+ input->data_npages = ghcb_get_rbx(ghcb);
+ ret = -ENOSPC;
+ break;
+ }
+ fallthrough;
+ default:
ret = -EIO;
+ break;
}
e_put:
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9583a110cf5f..123bf8b97a4b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -60,12 +60,6 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
-/*
- * This one is tied to SSB in the user API, and not
- * visible in /proc/cpuinfo.
- */
-#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
-
#define F feature_bit
/* Scattered Flag - For features that are scattered by cpufeatures.h. */
@@ -266,7 +260,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
- kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
cpuid_entry_change(best, X86_FEATURE_APIC,
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
@@ -275,7 +269,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
@@ -420,7 +414,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
+ if (kvm_vcpu_has_run(vcpu)) {
r = kvm_cpuid_check_equal(vcpu, e2, nent);
if (r)
return r;
@@ -715,7 +709,7 @@ void kvm_set_cpu_caps(void)
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
- __feature_bit(KVM_X86_FEATURE_AMD_PSFD)
+ F(AMD_PSFD)
);
/*
@@ -1002,7 +996,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0xd: {
- u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
+ u64 permitted_xcr0 = kvm_get_filtered_xcr0();
u64 permitted_xss = kvm_caps.supported_xss;
entry->eax &= permitted_xcr0;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a20bec931764..936a397a08cd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1640,6 +1640,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
break;
case VCPU_SREG_CS:
+ /*
+ * KVM uses "none" when loading CS as part of emulating Real
+ * Mode exceptions and IRET (handled above). In all other
+ * cases, loading CS without a control transfer is a KVM bug.
+ */
+ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE))
+ goto exception;
+
if (!(seg_desc.type & 8))
goto exception;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 4c91f626c058..75eae9c4998a 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -4,7 +4,7 @@
#include <linux/kvm_host.h>
-#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP)
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
@@ -157,6 +157,14 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr0 & mask;
}
+static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr0_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr0_bit));
+
+ return !!kvm_read_cr0_bits(vcpu, cr0_bit);
+}
+
static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
{
return kvm_read_cr0_bits(vcpu, ~0UL);
@@ -171,6 +179,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
+static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr4_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr4_bit));
+
+ return !!kvm_read_cr4_bits(vcpu, cr4_bit);
+}
+
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index 482d6639ef88..ded0bd688c65 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -10,17 +10,22 @@
#include "hyperv.h"
#include "kvm_onhyperv.h"
+struct kvm_hv_tlb_range {
+ u64 start_gfn;
+ u64 pages;
+};
+
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
- struct kvm_tlb_range *range = data;
+ struct kvm_hv_tlb_range *range = data;
return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
range->pages);
}
static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
- struct kvm_tlb_range *range)
+ struct kvm_hv_tlb_range *range)
{
if (range)
return hyperv_flush_guest_mapping_range(root_tdp,
@@ -29,8 +34,8 @@ static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
return hyperv_flush_guest_mapping(root_tdp);
}
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
+static int __hv_flush_remote_tlbs_range(struct kvm *kvm,
+ struct kvm_hv_tlb_range *range)
{
struct kvm_arch *kvm_arch = &kvm->arch;
struct kvm_vcpu *vcpu;
@@ -86,19 +91,29 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
spin_unlock(&kvm_arch->hv_root_tdp_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range);
-int hv_remote_flush_tlb(struct kvm *kvm)
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages)
+{
+ struct kvm_hv_tlb_range range = {
+ .start_gfn = start_gfn,
+ .pages = nr_pages,
+ };
+
+ return __hv_flush_remote_tlbs_range(kvm, &range);
+}
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range);
+
+int hv_flush_remote_tlbs(struct kvm *kvm)
{
- return hv_remote_flush_tlb_with_range(kvm, NULL);
+ return __hv_flush_remote_tlbs_range(kvm, NULL);
}
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
- if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) {
spin_lock(&kvm_arch->hv_root_tdp_lock);
vcpu->arch.hv_root_tdp = root_tdp;
if (root_tdp != kvm_arch->hv_root_tdp)
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 287e98ef9df3..ff127d313242 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -7,9 +7,8 @@
#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
#if IS_ENABLED(CONFIG_HYPERV)
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range);
-int hv_remote_flush_tlb(struct kvm *kvm);
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
+int hv_flush_remote_tlbs(struct kvm *kvm);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 168c46fd8dd1..92d5a1924fc1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -113,6 +113,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
@@ -132,7 +134,7 @@ static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
{
BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
- return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)
? cr3 & X86_CR3_PCID_MASK
: 0;
}
@@ -153,6 +155,24 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->root_role.level);
}
+static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ /*
+ * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e.
+ * @mmu's snapshot of CR0.WP and thus all related paging metadata may
+ * be stale. Refresh CR0.WP and the metadata on-demand when checking
+ * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing
+ * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does
+ * need to refresh nested_mmu, a.k.a. the walker used to translate L2
+ * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+ */
+ if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu)
+ return;
+
+ __kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+}
+
/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
@@ -184,8 +204,12 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
- bool fault = (mmu->permissions[index] >> pte_access) & 1;
u32 errcode = PFERR_PRESENT_MASK;
+ bool fault;
+
+ kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+
+ fault = (mmu->permissions[index] >> pte_access) & 1;
WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
if (unlikely(mmu->pkru_mask)) {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 144c5a01cd77..c8961f45e3b1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -125,17 +125,31 @@ module_param(dbg, bool, 0644);
#define PTE_LIST_EXT 14
/*
- * Slight optimization of cacheline layout, by putting `more' and `spte_count'
- * at the start; then accessing it will only use one single cacheline for
- * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
+ * is full and a new SPTEs needs to be added, a new descriptor is allocated and
+ * becomes the head of the list. This means that by definitions, all tail
+ * descriptors are full.
+ *
+ * Note, the meta data fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count<=6 (or if only the
+ * descriptors metadata is accessed).
*/
struct pte_list_desc {
struct pte_list_desc *more;
- /*
- * Stores number of entries stored in the pte_list_desc. No need to be
- * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
- */
- u64 spte_count;
+ /* The number of PTEs stored in _this_ descriptor. */
+ u32 spte_count;
+ /* The number of PTEs stored in all tails of this descriptor. */
+ u32 tail_count;
u64 *sptes[PTE_LIST_EXT];
};
@@ -242,32 +256,35 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
return regs;
}
-static inline bool kvm_available_flush_tlb_with_range(void)
+static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.tlb_remote_flush_with_range;
+ return kvm_read_cr3(vcpu);
}
-static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
{
- int ret = -ENOTSUPP;
-
- if (range && kvm_x86_ops.tlb_remote_flush_with_range)
- ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
+ if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ return kvm_read_cr3(vcpu);
- if (ret)
- kvm_flush_remote_tlbs(kvm);
+ return mmu->get_guest_pgd(vcpu);
}
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
- u64 start_gfn, u64 pages)
+static inline bool kvm_available_flush_remote_tlbs_range(void)
{
- struct kvm_tlb_range range;
+ return kvm_x86_ops.flush_remote_tlbs_range;
+}
- range.start_gfn = start_gfn;
- range.pages = pages;
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+ gfn_t nr_pages)
+{
+ int ret = -EOPNOTSUPP;
- kvm_flush_remote_tlbs_with_range(kvm, &range);
+ if (kvm_x86_ops.flush_remote_tlbs_range)
+ ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
+ nr_pages);
+ if (ret)
+ kvm_flush_remote_tlbs(kvm);
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
@@ -888,9 +905,9 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
untrack_possible_nx_huge_page(kvm, sp);
}
-static struct kvm_memory_slot *
-gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
+static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ bool no_dirty_log)
{
struct kvm_memory_slot *slot;
@@ -929,53 +946,69 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
+ desc->tail_count = 0;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->spte_count == PTE_LIST_EXT) {
- count += PTE_LIST_EXT;
- if (!desc->more) {
- desc->more = kvm_mmu_memory_cache_alloc(cache);
- desc = desc->more;
- desc->spte_count = 0;
- break;
- }
- desc = desc->more;
+ count = desc->tail_count + desc->spte_count;
+
+ /*
+ * If the previous head is full, allocate a new head descriptor
+ * as tail descriptors are always kept full.
+ */
+ if (desc->spte_count == PTE_LIST_EXT) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->spte_count = 0;
+ desc->tail_count = count;
+ rmap_head->val = (unsigned long)desc | 1;
}
- count += desc->spte_count;
desc->sptes[desc->spte_count++] = spte;
}
return count;
}
-static void
-pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
- struct pte_list_desc *desc, int i,
- struct pte_list_desc *prev_desc)
+static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i)
{
- int j = desc->spte_count - 1;
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ int j = head_desc->spte_count - 1;
+
+ /*
+ * The head descriptor should never be empty. A new head is added only
+ * when adding an entry and the previous head is full, and heads are
+ * removed (this flow) when they become empty.
+ */
+ BUG_ON(j < 0);
- desc->sptes[i] = desc->sptes[j];
- desc->sptes[j] = NULL;
- desc->spte_count--;
- if (desc->spte_count)
+ /*
+ * Replace the to-be-freed SPTE with the last valid entry from the head
+ * descriptor to ensure that tail descriptors are full at all times.
+ * Note, this also means that tail_count is stable for each descriptor.
+ */
+ desc->sptes[i] = head_desc->sptes[j];
+ head_desc->sptes[j] = NULL;
+ head_desc->spte_count--;
+ if (head_desc->spte_count)
return;
- if (!prev_desc && !desc->more)
+
+ /*
+ * The head descriptor is empty. If there are no tail descriptors,
+ * nullify the rmap head to mark the list as emtpy, else point the rmap
+ * head at the next descriptor, i.e. the new head.
+ */
+ if (!head_desc->more)
rmap_head->val = 0;
else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- rmap_head->val = (unsigned long)desc->more | 1;
- mmu_free_pte_list_desc(desc);
+ rmap_head->val = (unsigned long)head_desc->more | 1;
+ mmu_free_pte_list_desc(head_desc);
}
static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- struct pte_list_desc *prev_desc;
int i;
if (!rmap_head->val) {
@@ -991,16 +1024,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
} else {
rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- prev_desc = NULL;
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(rmap_head,
- desc, i, prev_desc);
+ pte_list_desc_remove_entry(rmap_head, desc, i);
return;
}
}
- prev_desc = desc;
desc = desc->more;
}
pr_err("%s: %p many->many\n", __func__, spte);
@@ -1047,7 +1077,6 @@ out:
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- unsigned int count = 0;
if (!rmap_head->val)
return 0;
@@ -1055,13 +1084,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
return 1;
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-
- while (desc) {
- count += desc->spte_count;
- desc = desc->more;
- }
-
- return count;
+ return desc->tail_count + desc->spte_count;
}
static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
@@ -1073,14 +1096,6 @@ static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}
-static bool rmap_can_add(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_memory_cache *mc;
-
- mc = &vcpu->arch.mmu_pte_list_desc_cache;
- return kvm_mmu_memory_cache_nr_free_objects(mc);
-}
-
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_memslots *slots;
@@ -1479,7 +1494,7 @@ restart:
}
}
- if (need_flush && kvm_available_flush_tlb_with_range()) {
+ if (need_flush && kvm_available_flush_remote_tlbs_range()) {
kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
return false;
}
@@ -1504,8 +1519,8 @@ struct slot_rmap_walk_iterator {
struct kvm_rmap_head *end_rmap;
};
-static void
-rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
+ int level)
{
iterator->level = level;
iterator->gfn = iterator->start_gfn;
@@ -1513,10 +1528,10 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}
-static void
-slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
- const struct kvm_memory_slot *slot, int start_level,
- int end_level, gfn_t start_gfn, gfn_t end_gfn)
+static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ const struct kvm_memory_slot *slot,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn)
{
iterator->slot = slot;
iterator->start_level = start_level;
@@ -1789,12 +1804,6 @@ static void mark_unsync(u64 *spte)
kvm_mmu_mark_parents_unsync(sp);
}
-static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- return -1;
-}
-
#define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages {
@@ -1914,10 +1923,79 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
+static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ .passthrough = 0x1,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+ return false;
+
+ return true;
+}
+
+static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ if (!sp->spt[i])
+ return 0;
+
+ return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
+}
+
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int flush = 0;
+ int i;
+
+ if (!kvm_sync_page_check(vcpu, sp))
+ return -1;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ int ret = kvm_sync_spte(vcpu, sp, i);
+
+ if (ret < -1)
+ return -1;
+ flush |= ret;
+ }
+
+ /*
+ * Note, any flush is purely for KVM's correctness, e.g. when dropping
+ * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+ * unmap or dirty logging event doesn't fail to flush. The guest is
+ * responsible for flushing the TLB to ensure any changes in protection
+ * bits are recognized, i.e. until the guest flushes or page faults on
+ * a relevant address, KVM is architecturally allowed to let vCPUs use
+ * cached translations with the old protection bits.
+ */
+ return flush;
+}
+
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
+ int ret = __kvm_sync_page(vcpu, sp);
if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
@@ -3304,9 +3382,9 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
* Returns true if the SPTE was fixed successfully. Otherwise,
* someone else modified the SPTE from its original value.
*/
-static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- u64 *sptep, u64 old_spte, u64 new_spte)
+static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, u64 new_spte)
{
/*
* Theoretically we could also set dirty bit (and flush TLB) here in
@@ -3513,6 +3591,8 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
LIST_HEAD(invalid_list);
bool free_active_root;
+ WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
+
BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
/* Before acquiring the MMU lock, see if we need to do any real work. */
@@ -3731,7 +3811,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
int quadrant, i, r;
hpa_t root;
- root_pgd = mmu->get_guest_pgd(vcpu);
+ root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
@@ -4181,7 +4261,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
arch.token = alloc_apf_token(vcpu);
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
- arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
return kvm_setup_async_pf(vcpu, cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
@@ -4200,7 +4280,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
return;
if (!vcpu->arch.mmu->root_role.direct &&
- work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
+ work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
@@ -4469,8 +4549,7 @@ static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = NULL;
+ context->sync_spte = NULL;
}
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -4604,11 +4683,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr3(vcpu);
-}
-
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
@@ -4638,10 +4712,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
#include "paging_tmpl.h"
#undef PTTYPE
-static void
-__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
- bool pse, bool amd)
+static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, int level, bool nx,
+ bool gbpages, bool pse, bool amd)
{
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
@@ -4754,9 +4827,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
guest_cpuid_is_amd_or_hygon(vcpu));
}
-static void
-__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, bool execonly, int huge_page_level)
+static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, bool execonly,
+ int huge_page_level)
{
u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
@@ -4856,8 +4929,7 @@ static inline bool boot_cpu_is_amd(void)
* the direct page table on host, use as much mmu features as
* possible, however, kvm currently does not do execution-protection.
*/
-static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -5060,20 +5132,18 @@ static void paging64_init_context(struct kvm_mmu *context)
{
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
- context->sync_page = paging64_sync_page;
- context->invlpg = paging64_invlpg;
+ context->sync_spte = paging64_sync_spte;
}
static void paging32_init_context(struct kvm_mmu *context)
{
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
- context->sync_page = paging32_sync_page;
- context->invlpg = paging32_invlpg;
+ context->sync_spte = paging32_sync_spte;
}
-static union kvm_cpu_role
-kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
+static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
+ const struct kvm_mmu_role_regs *regs)
{
union kvm_cpu_role role = {0};
@@ -5112,6 +5182,21 @@ kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
return role;
}
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
+
+ BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
+ BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
+
+ if (is_cr0_wp(mmu) == cr0_wp)
+ return;
+
+ mmu->cpu_role.base.cr0_wp = cr0_wp;
+ reset_guest_paging_metadata(vcpu, mmu);
+}
+
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
/* tdp_root_level is architecture forced level, use it if nonzero */
@@ -5157,9 +5242,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
context->cpu_role.as_u64 = cpu_role.as_u64;
context->root_role.word = root_role.word;
context->page_fault = kvm_tdp_page_fault;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = NULL;
- context->get_guest_pgd = get_cr3;
+ context->sync_spte = NULL;
+ context->get_guest_pgd = get_guest_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
@@ -5289,8 +5373,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->page_fault = ept_page_fault;
context->gva_to_gpa = ept_gva_to_gpa;
- context->sync_page = ept_sync_page;
- context->invlpg = ept_invlpg;
+ context->sync_spte = ept_sync_spte;
update_permission_bitmask(context, true);
context->pkru_mask = 0;
@@ -5309,7 +5392,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
kvm_init_shadow_mmu(vcpu, cpu_role);
- context->get_guest_pgd = get_cr3;
+ context->get_guest_pgd = get_guest_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
}
@@ -5323,7 +5406,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
return;
g_context->cpu_role.as_u64 = new_mode.as_u64;
- g_context->get_guest_pgd = get_cr3;
+ g_context->get_guest_pgd = get_guest_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
@@ -5331,7 +5414,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
* L2 page tables are never shadowed, so there is no need to sync
* SPTEs.
*/
- g_context->invlpg = NULL;
+ g_context->sync_spte = NULL;
/*
* Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
@@ -5393,7 +5476,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* Changing guest CPUID after KVM_RUN is forbidden, see the comment in
* kvm_arch_vcpu_ioctl().
*/
- KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
+ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5707,48 +5790,77 @@ emulate:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t gva, hpa_t root_hpa)
+static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+
+ vcpu_clear_mmio_info(vcpu, addr);
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
+ struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
+
+ if (sp->unsync) {
+ int ret = kvm_sync_spte(vcpu, sp, iterator.index);
+
+ if (ret < 0)
+ mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
+ if (ret)
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
+ }
+
+ if (!sp->unsync_children)
+ break;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots)
{
int i;
+ WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
+
/* It's actually a GPA for vcpu->arch.guest_mmu. */
if (mmu != &vcpu->arch.guest_mmu) {
/* INVLPG on a non-canonical address is a NOP according to the SDM. */
- if (is_noncanonical_address(gva, vcpu))
+ if (is_noncanonical_address(addr, vcpu))
return;
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
}
- if (!mmu->invlpg)
+ if (!mmu->sync_spte)
return;
- if (root_hpa == INVALID_PAGE) {
- mmu->invlpg(vcpu, gva, mmu->root.hpa);
+ if (roots & KVM_MMU_ROOT_CURRENT)
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
- /*
- * INVLPG is required to invalidate any global mappings for the VA,
- * irrespective of PCID. Since it would take us roughly similar amount
- * of work to determine whether any of the prev_root mappings of the VA
- * is marked global, or to just sync it blindly, so we might as well
- * just always sync it.
- *
- * Mappings not reachable via the current cr3 or the prev_roots will be
- * synced when switching to that cr3, so nothing needs to be done here
- * for them.
- */
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (VALID_PAGE(mmu->prev_roots[i].hpa))
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
- } else {
- mmu->invlpg(vcpu, gva, root_hpa);
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (roots & KVM_MMU_ROOT_PREVIOUS(i))
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
}
}
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Blindly sync all roots as it would take
+ * roughly the same amount of work/time to determine whether any of the
+ * previous roots have a global mapping.
+ *
+ * Mappings not reachable via the current or previous cached roots will
+ * be synced when switching to that new cr3, so nothing needs to be
+ * done here for them.
+ */
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5757,27 +5869,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- bool tlb_flush = false;
+ unsigned long roots = 0;
uint i;
- if (pcid == kvm_get_active_pcid(vcpu)) {
- if (mmu->invlpg)
- mmu->invlpg(vcpu, gva, mmu->root.hpa);
- tlb_flush = true;
- }
+ if (pcid == kvm_get_active_pcid(vcpu))
+ roots |= KVM_MMU_ROOT_CURRENT;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
- pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
- if (mmu->invlpg)
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
- tlb_flush = true;
- }
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
- if (tlb_flush)
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
-
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
++vcpu->stat.invlpg;
/*
@@ -5814,29 +5919,30 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm,
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot);
-/* The caller should hold mmu-lock before calling this function. */
-static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
- bool flush)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool flush_on_yield, bool flush)
{
struct slot_rmap_walk_iterator iterator;
- for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap, memslot);
+ flush |= fn(kvm, iterator.rmap, slot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush && flush_on_yield) {
- kvm_flush_remote_tlbs_with_address(kvm,
- start_gfn,
- iterator.gfn - start_gfn + 1);
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
flush = false;
}
cond_resched_rwlock_write(&kvm->mmu_lock);
@@ -5846,23 +5952,23 @@ slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
return flush;
}
-static __always_inline bool
-slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
{
- return slot_handle_level_range(kvm, memslot, fn, start_level,
- end_level, memslot->base_gfn,
- memslot->base_gfn + memslot->npages - 1,
- flush_on_yield, false);
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ flush_on_yield, false);
}
-static __always_inline bool
-slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
{
- return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- PG_LEVEL_4K, flush_on_yield);
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
}
static void free_mmu_pages(struct kvm_mmu *mmu)
@@ -6157,9 +6263,9 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (WARN_ON_ONCE(start >= end))
continue;
- flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
}
}
@@ -6191,8 +6297,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
+ kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
kvm_mmu_invalidate_end(kvm, 0, -1ul);
@@ -6212,8 +6317,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- slot_handle_level(kvm, memslot, slot_rmap_write_protect,
- start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
+ walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
write_unlock(&kvm->mmu_lock);
}
@@ -6448,10 +6553,9 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
* all the way to the target level. There's no need to split pages
* already at the target level.
*/
- for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
- slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
- level, level, start, end - 1, true, false);
- }
+ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
+ __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
+ level, level, start, end - 1, true, false);
}
/* Must be called with the mmu_lock held in write-mode. */
@@ -6530,7 +6634,7 @@ restart:
PG_LEVEL_NUM)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- if (kvm_available_flush_tlb_with_range())
+ if (kvm_available_flush_remote_tlbs_range())
kvm_flush_remote_tlbs_sptep(kvm, sptep);
else
need_tlb_flush = 1;
@@ -6549,8 +6653,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
* Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
* pages that are already mapped at the maximum hugepage level.
*/
- if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
+ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
}
@@ -6581,8 +6685,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
* is observed by any other operation on the same memslot.
*/
lockdep_assert_held(&kvm->slots_lock);
- kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
- memslot->npages);
+ kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
@@ -6594,7 +6697,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
* Clear dirty bits only on 4k SPTEs since the legacy MMU only
* support dirty logging at a 4k granularity.
*/
- slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
+ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
write_unlock(&kvm->mmu_lock);
}
@@ -6664,8 +6767,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
}
}
-static unsigned long
-mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
@@ -6723,8 +6826,8 @@ unlock:
return freed;
}
-static unsigned long
-mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 2cbb155c686c..d39af5639ce9 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -170,14 +170,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
- u64 start_gfn, u64 pages);
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+ gfn_t nr_pages);
/* Flush the given page (huge or not) of guest memory. */
static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
{
- kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level),
- KVM_PAGES_PER_HPAGE(level));
+ kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
}
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index a056f2773dd9..0662e0278e70 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
walker->level = mmu->cpu_role.base.level;
- pte = mmu->get_guest_pgd(vcpu);
+ pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
#if PTTYPE == 64
@@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, pt_element_t gpte, bool no_dirty_log)
+ u64 *spte, pt_element_t gpte)
{
struct kvm_memory_slot *slot;
unsigned pte_access;
@@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
- no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
if (!slot)
return false;
@@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (is_shadow_present_pte(*spte))
continue;
- if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
break;
}
}
@@ -846,64 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- u64 old_spte;
- int level;
- u64 *sptep;
-
- vcpu_clear_mmio_info(vcpu, gva);
-
- /*
- * No need to check return value here, rmap_can_add() can
- * help us to skip pte prefetch later.
- */
- mmu_topup_memory_caches(vcpu, true);
-
- if (!VALID_PAGE(root_hpa)) {
- WARN_ON(1);
- return;
- }
-
- write_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
-
- sp = sptep_to_sp(sptep);
- old_spte = *sptep;
- if (is_last_spte(old_spte, level)) {
- pt_element_t gpte;
- gpa_t pte_gpa;
-
- if (!sp->unsync)
- break;
-
- pte_gpa = FNAME(get_level1_sp_gpa)(sp);
- pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
-
- mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
- if (is_shadow_present_pte(old_spte))
- kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
-
- if (!rmap_can_add(vcpu))
- break;
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- break;
-
- FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
- }
-
- if (!sp->unsync_children)
- break;
- }
- write_unlock(&vcpu->kvm->mmu_lock);
-}
-
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t addr, u64 access,
@@ -936,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* can't change unless all sptes pointing to it are nuked first.
*
* Returns
- * < 0: the sp should be zapped
- * 0: the sp is synced and no tlb flushing is required
- * > 0: the sp is synced and tlb flushing is required
+ * < 0: failed to sync spte
+ * 0: the spte is synced and no tlb flushing is required
+ * > 0: the spte is synced and tlb flushing is required
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
- int i;
bool host_writable;
gpa_t first_pte_gpa;
- bool flush = false;
-
- /*
- * Ignore various flags when verifying that it's safe to sync a shadow
- * page using the current MMU context.
- *
- * - level: not part of the overall MMU role and will never match as the MMU's
- * level tracks the root level
- * - access: updated based on the new guest PTE
- * - quadrant: not part of the overall MMU role (similar to level)
- */
- const union kvm_mmu_page_role sync_role_ign = {
- .level = 0xf,
- .access = 0x7,
- .quadrant = 0x3,
- .passthrough = 0x1,
- };
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
- /*
- * Direct pages can never be unsync, and KVM should never attempt to
- * sync a shadow page for a different MMU context, e.g. if the role
- * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
- * reserved bits checks will be wrong, etc...
- */
- if (WARN_ON_ONCE(sp->role.direct ||
- (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
- return -1;
+ if (WARN_ON_ONCE(!sp->spt[i]))
+ return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
- for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
- u64 *sptep, spte;
- struct kvm_memory_slot *slot;
- unsigned pte_access;
- pt_element_t gpte;
- gpa_t pte_gpa;
- gfn_t gfn;
-
- if (!sp->spt[i])
- continue;
-
- pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return -1;
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- flush = true;
- continue;
- }
-
- gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access;
- pte_access &= FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
-
- if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
- continue;
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -1;
- /*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
- */
- if ((!pte_access && !shadow_present_mask) ||
- gfn != kvm_mmu_page_get_gfn(sp, i)) {
- drop_spte(vcpu->kvm, &sp->spt[i]);
- flush = true;
- continue;
- }
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ return 1;
- /* Update the shadowed access bits in case they changed. */
- kvm_mmu_page_set_access(sp, i, pte_access);
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- sptep = &sp->spt[i];
- spte = *sptep;
- host_writable = spte & shadow_host_writable_mask;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- make_spte(vcpu, sp, slot, pte_access, gfn,
- spte_to_pfn(spte), spte, true, false,
- host_writable, &spte);
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
+ return 0;
- flush |= mmu_spte_update(sptep, spte);
+ /*
+ * Drop the SPTE if the new protections would result in a RWX=0
+ * SPTE or if the gfn is changing. The RWX=0 case only affects
+ * EPT with execute-only support, i.e. EPT without an effective
+ * "present" bit, as all other paging modes will create a
+ * read-only SPTE if pte_access is zero.
+ */
+ if ((!pte_access && !shadow_present_mask) ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ return 1;
}
-
/*
- * Note, any flush is purely for KVM's correctness, e.g. when dropping
- * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
- * unmap or dirty logging event doesn't fail to flush. The guest is
- * responsible for flushing the TLB to ensure any changes in protection
- * bits are recognized, i.e. until the guest flushes or page faults on
- * a relevant address, KVM is architecturally allowed to let vCPUs use
- * cached translations with the old protection bits.
+ * Do nothing if the permissions are unchanged. The existing SPTE is
+ * still, and prefetch_invalid_gpte() has verified that the A/D bits
+ * are set in the "new" gPTE, i.e. there is no danger of missing an A/D
+ * update due to A/D bits being set in the SPTE but not the gPTE.
*/
- return flush;
+ if (kvm_mmu_page_get_access(sp, i) == pte_access)
+ return 0;
+
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
+
+ return mmu_spte_update(sptep, spte);
}
#undef pt_element_t
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c15bfca3ed15..cf2c6426a6fc 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -164,7 +164,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
/*
* For simplicity, enforce the NX huge page mitigation even if not
* strictly necessary. KVM could ignore the mitigation if paging is
- * disabled in the guest, as the guest doesn't have an page tables to
+ * disabled in the guest, as the guest doesn't have any page tables to
* abuse. But to safely ignore the mitigation, KVM would have to
* ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
* is toggled on, and that's a net negative for performance when TDP is
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..fae559559a80 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -29,29 +29,49 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
+/*
+ * SPTEs must be modified atomically if they are shadow-present, leaf
+ * SPTEs, and have volatile bits, i.e. has bits that can be set outside
+ * of mmu_lock. The Writable bit can be set by KVM's fast page fault
+ * handler, and Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+{
+ return is_shadow_present_pte(old_spte) &&
+ is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte);
+}
+
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
- /*
- * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
- * volatile bits, i.e. has bits that can be set outside of mmu_lock.
- * The Writable bit can be set by KVM's fast page fault handler, and
- * Accessed and Dirty bits can be set by the CPU.
- *
- * Note, non-leaf SPTEs do have Accessed bits and those bits are
- * technically volatile, but KVM doesn't consume the Accessed bit of
- * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
- * logic needs to be reassessed if KVM were to use non-leaf Accessed
- * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
- */
- if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
- spte_has_volatile_bits(old_spte))
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
return old_spte;
}
+static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
+ u64 mask, int level)
+{
+ atomic64_t *sptep_atomic;
+
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
+ sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+ }
+
+ __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
+ return old_spte;
+}
+
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
*/
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7c25dbf32ecc..b2fca11b91ff 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -334,35 +334,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
-static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
-{
- if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
- return;
-
- if (is_accessed_spte(old_spte) &&
- (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
- spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-}
-
-static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
-{
- bool pfn_changed;
- struct kvm_memory_slot *slot;
-
- if (level > PG_LEVEL_4K)
- return;
-
- pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
- if ((!is_writable_pte(old_spte) || pfn_changed) &&
- is_writable_pte(new_spte)) {
- slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
- mark_page_dirty_in_slot(kvm, slot, gfn);
- }
-}
-
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
@@ -505,7 +476,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
/**
- * __handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
@@ -516,12 +487,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* the MMU lock and the operation must synchronize with other
* threads that might be modifying SPTEs.
*
- * Handle bookkeeping that might result from the modification of a SPTE.
- * This function must be called for all TDP SPTE modifications.
+ * Handle bookkeeping that might result from the modification of a SPTE. Note,
+ * dirty logging updates are handled in common code, not here (see make_spte()
+ * and fast_pf_fix_direct_spte()).
*/
-static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
@@ -605,17 +577,10 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (was_present && !was_leaf &&
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
-}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
-{
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
- shared);
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ if (was_leaf && is_accessed_spte(old_spte) &&
+ (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
/*
@@ -658,9 +623,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;
- __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
- handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
return 0;
}
@@ -696,7 +660,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
/*
- * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance
* @as_id: Address space ID, i.e. regular vs. SMM
* @sptep: Pointer to the SPTE
@@ -704,23 +668,12 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* @new_spte: The new value that will be set for the SPTE
* @gfn: The base GFN that was (or will be) mapped by the SPTE
* @level: The level _containing_ the SPTE (its parent PT's level)
- * @record_acc_track: Notify the MM subsystem of changes to the accessed state
- * of the page. Should be set unless handling an MMU
- * notifier for access tracking. Leaving record_acc_track
- * unset in that case prevents page accesses from being
- * double counted.
- * @record_dirty_log: Record the page as dirty in the dirty bitmap if
- * appropriate for the change being made. Should be set
- * unless performing certain dirty logging operations.
- * Leaving record_dirty_log unset in that case prevents page
- * writes from being double counted.
*
* Returns the old SPTE value, which _may_ be different than @old_spte if the
* SPTE had voldatile bits.
*/
-static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level,
- bool record_acc_track, bool record_dirty_log)
+static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -735,46 +688,17 @@ static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
-
- if (record_acc_track)
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
return old_spte;
}
-static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte, bool record_acc_track,
- bool record_dirty_log)
+static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte)
{
WARN_ON_ONCE(iter->yielded);
-
- iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
- iter->old_spte, new_spte,
- iter->gfn, iter->level,
- record_acc_track, record_dirty_log);
-}
-
-static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
-}
-
-static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
-}
-
-static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level);
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
@@ -866,7 +790,7 @@ retry:
continue;
if (!shared)
- tdp_mmu_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
goto retry;
}
@@ -923,8 +847,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1, true, true);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+ sp->gfn, sp->role.level + 1);
return true;
}
@@ -958,7 +882,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
flush = true;
}
@@ -1128,7 +1052,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
if (ret)
return ret;
} else {
- tdp_mmu_set_spte(kvm, iter, spte);
+ tdp_mmu_iter_set_spte(kvm, iter, spte);
}
tdp_account_mmu_page(kvm, sp);
@@ -1262,33 +1186,42 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
+ *
+ * No need to mark the corresponding PFN as accessed as this call is coming
+ * from the clear_young() or clear_flush_young() notifier, which uses the
+ * return value to determine if the page has been accessed.
*/
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
{
- u64 new_spte = 0;
+ u64 new_spte;
/* If we have a non-accessed entry we don't need to change the pte. */
if (!is_accessed_spte(iter->old_spte))
return false;
- new_spte = iter->old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- new_spte &= ~shadow_accessed_mask;
+ if (spte_ad_enabled(iter->old_spte)) {
+ iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
+ iter->old_spte,
+ shadow_accessed_mask,
+ iter->level);
+ new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
/*
* Capture the dirty status of the page, so that it doesn't get
* lost when the SPTE is marked for access tracking.
*/
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+ if (is_writable_pte(iter->old_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
- new_spte = mark_spte_for_access_track(new_spte);
+ new_spte = mark_spte_for_access_track(iter->old_spte);
+ iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
+ iter->old_spte, new_spte,
+ iter->level);
}
- tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
-
+ trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
+ iter->old_spte, new_spte);
return true;
}
@@ -1324,15 +1257,15 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
* Note, when changing a read-only SPTE, it's not strictly necessary to
* zero the SPTE before setting the new PFN, but doing so preserves the
* invariant that the PFN of a present * leaf SPTE can never change.
- * See __handle_changed_spte().
+ * See handle_changed_spte().
*/
- tdp_mmu_set_spte(kvm, iter, 0);
+ tdp_mmu_iter_set_spte(kvm, iter, 0);
if (!pte_write(range->pte)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
pte_pfn(range->pte));
- tdp_mmu_set_spte(kvm, iter, new_spte);
+ tdp_mmu_iter_set_spte(kvm, iter, new_spte);
}
return true;
@@ -1349,7 +1282,7 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
/*
* No need to handle the remote TLB flush under RCU protection, the
* target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
- * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
+ * shadow page. See the WARN on pfn_changed in handle_changed_spte().
*/
return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}
@@ -1607,8 +1540,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
+ u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
struct tdp_iter iter;
- u64 new_spte;
bool spte_set = false;
rcu_read_lock();
@@ -1621,19 +1554,13 @@ retry:
if (!is_shadow_present_pte(iter.old_spte))
continue;
- if (spte_ad_need_write_protect(iter.old_spte)) {
- if (is_writable_pte(iter.old_spte))
- new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- else
- continue;
- } else {
- if (iter.old_spte & shadow_dirty_mask)
- new_spte = iter.old_spte & ~shadow_dirty_mask;
- else
- continue;
- }
+ MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
- if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
goto retry;
spte_set = true;
@@ -1675,8 +1602,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
+ u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
- u64 new_spte;
rcu_read_lock();
@@ -1685,25 +1613,26 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
+ MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
+
if (iter.level > PG_LEVEL_4K ||
!(mask & (1UL << (iter.gfn - gfn))))
continue;
mask &= ~(1UL << (iter.gfn - gfn));
- if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
- if (is_writable_pte(iter.old_spte))
- new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- else
- continue;
- } else {
- if (iter.old_spte & shadow_dirty_mask)
- new_spte = iter.old_spte & ~shadow_dirty_mask;
- else
- continue;
- }
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
+ iter.old_spte, dbit,
+ iter.level);
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
+ iter.old_spte,
+ iter.old_spte & ~dbit);
+ kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
}
rcu_read_unlock();
@@ -1821,7 +1750,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
if (new_spte == iter.old_spte)
break;
- tdp_mmu_set_spte(kvm, &iter, new_spte);
+ tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 612e6c70ce2e..1690d41c1830 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -93,7 +93,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
#undef __KVM_X86_PMU_OP
}
-static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
{
return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
}
@@ -400,6 +400,12 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
return is_fixed_event_allowed(filter, pmc->idx);
}
+static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
+{
+ return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
+ check_pmu_event_filter(pmc);
+}
+
static void reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -409,10 +415,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
pmc_pause_counter(pmc);
- if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
- goto reprogram_complete;
-
- if (!check_pmu_event_filter(pmc))
+ if (!pmc_event_is_allowed(pmc))
goto reprogram_complete;
if (pmc->counter < pmc->prev_counter)
@@ -540,9 +543,9 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (!pmc)
return 1;
- if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
(static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
- (kvm_read_cr0(vcpu) & X86_CR0_PE))
+ kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
return 1;
*data = pmc_read_counter(pmc) & mask;
@@ -589,6 +592,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+ return;
+
+ bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
static_call(kvm_x86_pmu_refresh)(vcpu);
}
@@ -646,7 +653,7 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
pmc->prev_counter = pmc->counter;
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
- kvm_pmu_request_counter_reprogam(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
@@ -684,7 +691,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
- if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
+ if (!pmc || !pmc_event_is_allowed(pmc))
continue;
/* Ignore checks for edge detect, pin control, invert and CMASK bits */
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index be62c16f2265..5c7bbf03b599 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -195,7 +195,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
KVM_PMC_MAX_FIXED);
}
-static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc)
+static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
{
set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 05d38944a6c0..96936ddf1b3c 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -139,13 +139,18 @@ void recalc_intercepts(struct vcpu_svm *svm)
if (g->int_ctl & V_INTR_MASKING_MASK) {
/*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
- * does not affect any interrupt we may want to inject;
- * therefore, writes to CR8 are irrelevant to L0, as are
- * interrupt window vmexits.
+ * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
+ * disable intercept of CR8 writes as L2's CR8 does not affect
+ * any interrupt KVM may want to inject.
+ *
+ * Similarly, disable intercept of virtual interrupts (used to
+ * detect interrupt windows) if the saved RFLAGS.IF is '0', as
+ * the effective RFLAGS.IF for L1 interrupts will never be set
+ * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
*/
vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
- vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
/*
@@ -276,6 +281,11 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
return false;
+ if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
+ !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
+ return false;
+ }
+
return true;
}
@@ -416,22 +426,24 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
/* Only a few fields of int_ctl are written by the processor. */
mask = V_IRQ_MASK | V_TPR_MASK;
- if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
- svm_is_intercept(svm, INTERCEPT_VINTR)) {
- /*
- * In order to request an interrupt window, L0 is usurping
- * svm->vmcb->control.int_ctl and possibly setting V_IRQ
- * even if it was clear in L1's VMCB. Restoring it would be
- * wrong. However, in this case V_IRQ will remain true until
- * interrupt_window_interception calls svm_clear_vintr and
- * restores int_ctl. We can just leave it aside.
- */
+ /*
+ * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
+ * virtual interrupts in order to request an interrupt window, as KVM
+ * has usurped vmcb02's int_ctl. If an interrupt window opens before
+ * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
+ * If no window opens, V_IRQ will be correctly preserved in vmcb12's
+ * int_ctl (because it was never recognized while L2 was running).
+ */
+ if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
+ !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
mask &= ~V_IRQ_MASK;
- }
if (nested_vgif_enabled(svm))
mask |= V_GIF_MASK;
+ if (nested_vnmi_enabled(svm))
+ mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
+
svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}
@@ -651,6 +663,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
else
int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ if (vnmi) {
+ if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
+ svm->vcpu.arch.nmi_pending++;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+ if (nested_vnmi_enabled(svm))
+ int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
+ V_NMI_ENABLE_MASK |
+ V_NMI_BLOCKING_MASK);
+ }
+
/* Copied from vmcb01. msrpm_base can be overwritten later. */
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
@@ -1021,6 +1044,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_switch_vmcb(svm, &svm->vmcb01);
+ /*
+ * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
+ *
+ * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't
+ * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
+ * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
+ * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that
+ * KVM re-requests an interrupt window if necessary, which implicitly
+ * copies this bits from vmcb02 to vmcb01.
+ *
+ * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
+ * is stored in vmcb02, but its value doesn't need to be copied from/to
+ * vmcb01 because it is copied from/to the virtual APIC's TPR register
+ * on each VM entry/exit.
+ *
+ * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
+ * V_GIF. However, GIF is architecturally clear on each VM exit, thus
+ * there is no need to copy V_GIF from vmcb02 to vmcb01.
+ */
+ if (!nested_exit_on_intr(svm))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
svm_copy_lbrs(vmcb12, vmcb02);
svm_update_lbrv(vcpu);
@@ -1029,6 +1074,20 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_update_lbrv(vcpu);
}
+ if (vnmi) {
+ if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+ vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ if (vcpu->arch.nmi_pending) {
+ vcpu->arch.nmi_pending--;
+ vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
+ } else {
+ vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
+ }
+ }
+
/*
* On vmexit the GIF is set to false and
* no event can be injected in L1.
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index cc77a0681800..5fa939e411d8 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -161,7 +161,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~pmu->reserved_bits;
if (data != pmc->eventsel) {
pmc->eventsel = data;
- kvm_pmu_request_counter_reprogam(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
return 0;
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 70183d2271b5..eb308c9994f9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -95,6 +95,7 @@ static const struct svm_direct_access_msrs {
#endif
{ .index = MSR_IA32_SPEC_CTRL, .always = false },
{ .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_FLUSH_CMD, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -230,6 +231,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
static bool svm_gp_erratum_intercept = true;
@@ -1311,6 +1314,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
+ if (vnmi)
+ svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
+
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
svm_clr_intercept(svm, INTERCEPT_CLGI);
@@ -1584,6 +1590,16 @@ static void svm_set_vintr(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_VINTR);
/*
+ * Recalculating intercepts may have cleared the VINTR intercept. If
+ * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
+ * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
+ * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
+ * interrupts will never be unblocked while L2 is running.
+ */
+ if (!svm_is_intercept(svm, INTERCEPT_VINTR))
+ return;
+
+ /*
* This is just a dummy VINTR to actually cause a vmexit to happen.
* Actual injection of virtual interrupts happens through EVENTINJ.
*/
@@ -2480,16 +2496,29 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
has_error_code, error_code);
}
+static void svm_clr_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+}
+
+static void svm_set_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
+}
+
static int iret_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
++vcpu->stat.nmi_window_exits;
svm->awaiting_iret_completion = true;
- if (!sev_es_guest(vcpu->kvm)) {
- svm_clr_intercept(svm, INTERCEPT_IRET);
+
+ svm_clr_iret_intercept(svm);
+ if (!sev_es_guest(vcpu->kvm))
svm->nmi_iret_rip = kvm_rip_read(vcpu);
- }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
@@ -2869,32 +2898,10 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int svm_set_msr_ia32_cmd(struct kvm_vcpu *vcpu, struct msr_data *msr,
- bool guest_has_feat, u64 cmd,
- int x86_feature_bit)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!msr->host_initiated && !guest_has_feat)
- return 1;
-
- if (!(msr->data & ~cmd))
- return 1;
- if (!boot_cpu_has(x86_feature_bit))
- return 1;
- if (!msr->data)
- return 0;
-
- wrmsrl(msr->index, cmd);
- set_msr_interception(vcpu, svm->msrpm, msr->index, 0, 1);
-
- return 0;
-}
-
static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- int r;
+ int ret = 0;
u32 ecx = msr->index;
u64 data = msr->data;
@@ -2964,16 +2971,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
*/
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
break;
- case MSR_IA32_PRED_CMD:
- r = svm_set_msr_ia32_cmd(vcpu, msr,
- guest_has_pred_cmd_msr(vcpu),
- PRED_CMD_IBPB, X86_FEATURE_IBPB);
- break;
- case MSR_IA32_FLUSH_CMD:
- r = svm_set_msr_ia32_cmd(vcpu, msr,
- guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D),
- L1D_FLUSH, X86_FEATURE_FLUSH_L1D);
- break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
@@ -3026,10 +3023,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* guest via direct_access_msrs, and switch it via user return.
*/
preempt_disable();
- r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
+ ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
preempt_enable();
- if (r)
- return 1;
+ if (ret)
+ break;
svm->tsc_aux = data;
break;
@@ -3087,7 +3084,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
default:
return kvm_set_msr_common(vcpu, msr);
}
- return 0;
+ return ret;
}
static int msr_interception(struct kvm_vcpu *vcpu)
@@ -3498,11 +3495,43 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
return;
svm->nmi_masked = true;
- if (!sev_es_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_IRET);
+ svm_set_iret_intercept(svm);
++vcpu->stat.nmi_injections;
}
+static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ return !!(svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK);
+}
+
+static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
+ return false;
+
+ svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+
+ /*
+ * Because the pending NMI is serviced by hardware, KVM can't know when
+ * the NMI is "injected", but for all intents and purposes, passing the
+ * NMI off to hardware counts as injection.
+ */
+ ++vcpu->stat.nmi_injections;
+
+ return true;
+}
+
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3598,6 +3627,35 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
}
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm))
+ return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
+ else
+ return svm->nmi_masked;
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm)) {
+ if (masked)
+ svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ } else {
+ svm->nmi_masked = masked;
+ if (masked)
+ svm_set_iret_intercept(svm);
+ else
+ svm_clr_iret_intercept(svm);
+ }
+}
+
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3609,8 +3667,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return false;
- return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- svm->nmi_masked;
+ if (svm_get_nmi_mask(vcpu))
+ return true;
+
+ return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -3628,26 +3688,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return 1;
}
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
- return to_svm(vcpu)->nmi_masked;
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (masked) {
- svm->nmi_masked = true;
- if (!sev_es_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_IRET);
- } else {
- svm->nmi_masked = false;
- if (!sev_es_guest(vcpu->kvm))
- svm_clr_intercept(svm, INTERCEPT_IRET);
- }
-}
-
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3728,7 +3768,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->nmi_masked && !svm->awaiting_iret_completion)
+ /*
+ * KVM should never request an NMI window when vNMI is enabled, as KVM
+ * allows at most one to-be-injected NMI and one pending NMI, i.e. if
+ * two NMIs arrive simultaneously, KVM will inject one and set
+ * V_NMI_PENDING for the other. WARN, but continue with the standard
+ * single-step approach to try and salvage the pending NMI.
+ */
+ WARN_ON_ONCE(is_vnmi_enabled(svm));
+
+ if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
@@ -4124,7 +4173,7 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_MCG_EXT_CTL:
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
return false;
case MSR_IA32_SMBASE:
if (!IS_ENABLED(CONFIG_KVM_SMM))
@@ -4166,8 +4215,18 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+ svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+
svm_recalc_instruction_intercepts(vcpu, svm);
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
+ !!guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
+ !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
@@ -4545,7 +4604,6 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
bool smep, smap, is_user;
- unsigned long cr4;
u64 error_code;
/* Emulation is always possible when KVM has access to all guest state. */
@@ -4637,9 +4695,8 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
goto resume_guest;
- cr4 = kvm_read_cr4(vcpu);
- smep = cr4 & X86_CR4_SMEP;
- smap = cr4 & X86_CR4_SMAP;
+ smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
+ smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
@@ -4777,6 +4834,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.patch_hypercall = svm_patch_hypercall,
.inject_irq = svm_inject_irq,
.inject_nmi = svm_inject_nmi,
+ .is_vnmi_pending = svm_is_vnmi_pending,
+ .set_vnmi_pending = svm_set_vnmi_pending,
.inject_exception = svm_inject_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
@@ -4919,6 +4978,9 @@ static __init void svm_set_cpu_caps(void)
if (vgif)
kvm_cpu_cap_set(X86_FEATURE_VGIF);
+ if (vnmi)
+ kvm_cpu_cap_set(X86_FEATURE_VNMI);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
@@ -5070,6 +5132,16 @@ static __init int svm_hardware_setup(void)
pr_info("Virtual GIF supported\n");
}
+ vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
+ if (vnmi)
+ pr_info("Virtual NMI enabled\n");
+
+ if (!vnmi) {
+ svm_x86_ops.is_vnmi_pending = NULL;
+ svm_x86_ops.set_vnmi_pending = NULL;
+ }
+
+
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 839809972da1..f44751dd8d5d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -36,6 +36,7 @@ extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi;
extern bool x2avic_enabled;
+extern bool vnmi;
/*
* Clean bits in VMCB.
@@ -265,6 +266,7 @@ struct vcpu_svm {
bool pause_filter_enabled : 1;
bool pause_threshold_enabled : 1;
bool vgif_enabled : 1;
+ bool vnmi_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
@@ -539,6 +541,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
+static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
+{
+ return svm->vnmi_enabled &&
+ (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
+}
+
static inline bool is_x2apic_msrpm_offset(u32 offset)
{
/* 4 msrs per u8, and 4 u8 in u32 */
@@ -548,6 +556,27 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
(msr < (APIC_BASE_MSR + 0x100));
}
+static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm)
+{
+ if (!vnmi)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu))
+ return NULL;
+ else
+ return svm->vmcb01.ptr;
+}
+
+static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vnmi_vmcb_l1(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK);
+ else
+ return false;
+}
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index cff838f15db5..823001033539 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -35,9 +35,8 @@ static inline __init void svm_hv_hardware_setup(void)
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
- svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
- svm_x86_ops.tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index dab529cd799c..96ede74a6067 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -358,6 +358,7 @@ static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
gpa_t addr)
{
+ unsigned long roots = 0;
uint i;
struct kvm_mmu_root_info *cached_root;
@@ -368,8 +369,10 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
eptp))
- vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -4481,7 +4484,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
* (KVM doesn't change it);
*/
- vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmx_set_cr0(vcpu, vmcs12->host_cr0);
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
@@ -4632,7 +4635,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
*/
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
- vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
@@ -5154,7 +5157,7 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
* does force CR0.PE=1, but only to also force VM86 in order to emulate
* Real Mode, and so there's no need to check CR0.PE manually.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e8a3be0b9df9..741efe2c497b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -57,7 +57,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- kvm_pmu_request_counter_reprogam(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
}
@@ -76,13 +76,13 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
{
int bit;
- struct kvm_pmc *pmc;
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
- pmc = intel_pmc_idx_to_pmc(pmu, bit);
- if (pmc)
- kvm_pmu_request_counter_reprogam(pmc);
- }
+ if (!diff)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ set_bit(bit, pmu->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
}
static bool intel_hw_event_available(struct kvm_pmc *pmc)
@@ -351,45 +351,47 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
msr_info->data = pmu->fixed_ctr_ctrl;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_STATUS:
msr_info->data = pmu->global_status;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_CTRL:
msr_info->data = pmu->global_ctrl;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = 0;
- return 0;
+ break;
case MSR_IA32_PEBS_ENABLE:
msr_info->data = pmu->pebs_enable;
- return 0;
+ break;
case MSR_IA32_DS_AREA:
msr_info->data = pmu->ds_area;
- return 0;
+ break;
case MSR_PEBS_DATA_CFG:
msr_info->data = pmu->pebs_data_cfg;
- return 0;
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
u64 val = pmc_read_counter(pmc);
msr_info->data =
val & pmu->counter_bitmask[KVM_PMC_GP];
- return 0;
+ break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
u64 val = pmc_read_counter(pmc);
msr_info->data =
val & pmu->counter_bitmask[KVM_PMC_FIXED];
- return 0;
+ break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
msr_info->data = pmc->eventsel;
- return 0;
- } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
- return 0;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) {
+ break;
+ }
+ return 1;
}
- return 1;
+ return 0;
}
static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -402,44 +404,43 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (pmu->fixed_ctr_ctrl == data)
- return 0;
- if (!(data & pmu->fixed_ctr_ctrl_mask)) {
+ if (data & pmu->fixed_ctr_ctrl_mask)
+ return 1;
+
+ if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
- return 0;
- }
break;
case MSR_CORE_PERF_GLOBAL_STATUS:
- if (msr_info->host_initiated) {
- pmu->global_status = data;
- return 0;
- }
- break; /* RO MSR */
+ if (!msr_info->host_initiated)
+ return 1; /* RO MSR */
+
+ pmu->global_status = data;
+ break;
case MSR_CORE_PERF_GLOBAL_CTRL:
- if (pmu->global_ctrl == data)
- return 0;
- if (kvm_valid_perf_global_ctrl(pmu, data)) {
+ if (!kvm_valid_perf_global_ctrl(pmu, data))
+ return 1;
+
+ if (pmu->global_ctrl != data) {
diff = pmu->global_ctrl ^ data;
pmu->global_ctrl = data;
reprogram_counters(pmu, diff);
- return 0;
}
break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (!(data & pmu->global_ovf_ctrl_mask)) {
- if (!msr_info->host_initiated)
- pmu->global_status &= ~data;
- return 0;
- }
+ if (data & pmu->global_ovf_ctrl_mask)
+ return 1;
+
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
break;
case MSR_IA32_PEBS_ENABLE:
- if (pmu->pebs_enable == data)
- return 0;
- if (!(data & pmu->pebs_enable_mask)) {
+ if (data & pmu->pebs_enable_mask)
+ return 1;
+
+ if (pmu->pebs_enable != data) {
diff = pmu->pebs_enable ^ data;
pmu->pebs_enable = data;
reprogram_counters(pmu, diff);
- return 0;
}
break;
case MSR_IA32_DS_AREA:
@@ -447,15 +448,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
+
pmu->ds_area = data;
- return 0;
+ break;
case MSR_PEBS_DATA_CFG:
- if (pmu->pebs_data_cfg == data)
- return 0;
- if (!(data & pmu->pebs_data_cfg_mask)) {
- pmu->pebs_data_cfg = data;
- return 0;
- }
+ if (data & pmu->pebs_data_cfg_mask)
+ return 1;
+
+ pmu->pebs_data_cfg = data;
break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
@@ -463,33 +463,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
(data & ~pmu->counter_bitmask[KVM_PMC_GP]))
return 1;
+
if (!msr_info->host_initiated &&
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
pmc_update_sample_period(pmc);
- return 0;
+ break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
pmc_update_sample_period(pmc);
- return 0;
+ break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
- if (data == pmc->eventsel)
- return 0;
reserved_bits = pmu->reserved_bits;
if ((pmc->idx == 2) &&
(pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
- if (!(data & reserved_bits)) {
+ if (data & reserved_bits)
+ return 1;
+
+ if (data != pmc->eventsel) {
pmc->eventsel = data;
- kvm_pmu_request_counter_reprogam(pmc);
- return 0;
+ kvm_pmu_request_counter_reprogram(pmc);
}
- } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
- return 0;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) {
+ break;
+ }
+ /* Not a known PMU MSR. */
+ return 1;
}
- return 1;
+ return 0;
}
static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
@@ -531,6 +536,16 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->pebs_enable_mask = ~0ull;
pmu->pebs_data_cfg_mask = ~0ull;
+ memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
+
+ /*
+ * Setting passthrough of LBR MSRs is done only in the VM-Entry loop,
+ * and PMU refresh is disallowed after the vCPU has run, i.e. this code
+ * should never be reached while KVM is passing through MSRs.
+ */
+ if (KVM_BUG_ON(lbr_desc->msr_passthrough, vcpu->kvm))
+ return;
+
entry = kvm_find_cpuid_entry(vcpu, 0xa);
if (!entry || !vcpu->kvm->arch.enable_pmu)
return;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e06fcd6144b0..44fb619803b8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -164,6 +164,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
MSR_IA32_TSC,
#ifdef CONFIG_X86_64
MSR_FS_BASE,
@@ -1945,7 +1946,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
@@ -2030,7 +2031,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested_vmx_allowed(vcpu))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
@@ -2133,39 +2134,6 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
return debugctl;
}
-static int vmx_set_msr_ia32_cmd(struct kvm_vcpu *vcpu,
- struct msr_data *msr_info,
- bool guest_has_feat, u64 cmd,
- int x86_feature_bit)
-{
- if (!msr_info->host_initiated && !guest_has_feat)
- return 1;
-
- if (!(msr_info->data & ~cmd))
- return 1;
- if (!boot_cpu_has(x86_feature_bit))
- return 1;
- if (!msr_info->data)
- return 0;
-
- wrmsrl(msr_info->index, cmd);
-
- /*
- * For non-nested:
- * When it's written (to non-zero) for the first time, pass
- * it through.
- *
- * For nested:
- * The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_prepare_msr_bitmap. We should not touch the
- * vmcs02.msr_bitmap here since it gets completely overwritten
- * in the merging.
- */
- vmx_disable_intercept_for_msr(vcpu, msr_info->index, MSR_TYPE_W);
-
- return 0;
-}
-
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -2318,18 +2286,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
return 1;
goto find_uret_msr;
- case MSR_IA32_PRED_CMD:
- ret = vmx_set_msr_ia32_cmd(vcpu, msr_info,
- guest_has_pred_cmd_msr(vcpu),
- PRED_CMD_IBPB,
- X86_FEATURE_IBPB);
- break;
- case MSR_IA32_FLUSH_CMD:
- ret = vmx_set_msr_ia32_cmd(vcpu, msr_info,
- guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D),
- L1D_FLUSH,
- X86_FEATURE_FLUSH_L1D);
- break;
case MSR_IA32_CR_PAT:
if (!kvm_pat_valid(data))
return 1;
@@ -2384,7 +2340,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx->msr_ia32_sgxlepubkeyhash
[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
if (!nested_vmx_allowed(vcpu))
@@ -4790,7 +4746,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
/* 22.2.1, 20.8.1 */
vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
@@ -5180,7 +5136,7 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
- return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
+ return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
@@ -5517,7 +5473,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
case 3: /* lmsw */
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
- trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
kvm_lmsw(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
@@ -6974,7 +6930,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
* real mode.
*/
return enable_unrestricted_guest || emulate_invalid_guest_state;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
case MSR_AMD64_TSC_RATIO:
@@ -7575,7 +7531,7 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+ if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cache = MTRR_TYPE_WRBACK;
else
@@ -7761,6 +7717,13 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
!guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
set_cr4_guest_host_mask(vmx);
@@ -7793,9 +7756,11 @@ static u64 vmx_get_perf_capabilities(void)
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
- x86_perf_get_lbr(&lbr);
- if (lbr.nr)
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
+ x86_perf_get_lbr(&lbr);
+ if (lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ }
if (vmx_pebs_supported()) {
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
@@ -8447,9 +8412,8 @@ static __init int hardware_setup(void)
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
- vmx_x86_ops.tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 2acdc54bc34b..9e66531861cf 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -369,7 +369,7 @@ struct vcpu_vmx {
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -640,6 +640,24 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
(1 << VCPU_EXREG_EXIT_INFO_1) | \
(1 << VCPU_EXREG_EXIT_INFO_2))
+static inline unsigned long vmx_l1_guest_owned_cr0_bits(void)
+{
+ unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+
+ /*
+ * CR0.WP needs to be intercepted when KVM is shadowing legacy paging
+ * in order to construct shadow PTEs with the correct protections.
+ * Note! CR0.WP technically can be passed through to the guest if
+ * paging is disabled, but checking CR0.PG would generate a cyclical
+ * dependency of sorts due to forcing the caller to ensure CR0 holds
+ * the correct value prior to determining which CR0 bits can be owned
+ * by L1. Keep it simple and limit the optimization to EPT.
+ */
+ if (!enable_ept)
+ bits &= ~X86_CR0_WP;
+ return bits;
+}
+
static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
return container_of(kvm, struct kvm_vmx, kvm);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 237c483b1230..523c39a03c00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -194,7 +194,7 @@ bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);
/* Enable/disable SMT_RSB bug mitigation */
-bool __read_mostly mitigate_smt_rsb;
+static bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);
/*
@@ -802,8 +802,8 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
*/
if ((fault->error_code & PFERR_PRESENT_MASK) &&
!(fault->error_code & PFERR_RSVD_MASK))
- kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
- fault_mmu->root.hpa);
+ kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+ KVM_MMU_ROOT_CURRENT);
fault_mmu->inject_page_fault(vcpu, fault);
}
@@ -841,7 +841,7 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
- if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
return true;
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -906,6 +906,24 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
+ /*
+ * CR0.WP is incorporated into the MMU role, but only for non-nested,
+ * indirect shadow MMUs. If paging is disabled, no updates are needed
+ * as there are no permission bits to emulate. If TDP is enabled, the
+ * MMU's metadata needs to be updated, e.g. so that emulating guest
+ * translations does the right thing, but there's no need to unload the
+ * root as CR0.WP doesn't affect SPTEs.
+ */
+ if ((cr0 ^ old_cr0) == X86_CR0_WP) {
+ if (!(cr0 & X86_CR0_PG))
+ return;
+
+ if (tdp_enabled) {
+ kvm_init_mmu(vcpu);
+ return;
+ }
+ }
+
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
@@ -965,7 +983,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
return 1;
if (!(cr0 & X86_CR0_PG) &&
- (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
+ (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -987,7 +1005,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return;
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
@@ -1001,7 +1019,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (static_cpu_has(X86_FEATURE_PKU) &&
vcpu->arch.pkru != vcpu->arch.host_pkru &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
write_pkru(vcpu->arch.pkru);
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
@@ -1015,14 +1033,14 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
write_pkru(vcpu->arch.host_pkru);
}
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
@@ -1178,9 +1196,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
- return 1;
-
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
return 1;
@@ -1227,7 +1242,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
* PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
* with PCIDE=0.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
return;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
@@ -1242,9 +1257,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
bool skip_tlb_flush = false;
unsigned long pcid = 0;
#ifdef CONFIG_X86_64
- bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
- if (pcid_enabled) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
cr3 &= ~X86_CR3_PCID_NOFLUSH;
pcid = cr3 & X86_CR3_PCID_MASK;
@@ -1543,39 +1556,41 @@ static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;
/*
- * List of msr numbers which are used to expose MSR-based features that
- * can be used by a hypervisor to validate requested CPU features.
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
*/
-static const u32 msr_based_features_all[] = {
- MSR_IA32_VMX_BASIC,
- MSR_IA32_VMX_TRUE_PINBASED_CTLS,
- MSR_IA32_VMX_PINBASED_CTLS,
- MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
- MSR_IA32_VMX_PROCBASED_CTLS,
- MSR_IA32_VMX_TRUE_EXIT_CTLS,
- MSR_IA32_VMX_EXIT_CTLS,
- MSR_IA32_VMX_TRUE_ENTRY_CTLS,
- MSR_IA32_VMX_ENTRY_CTLS,
- MSR_IA32_VMX_MISC,
- MSR_IA32_VMX_CR0_FIXED0,
- MSR_IA32_VMX_CR0_FIXED1,
- MSR_IA32_VMX_CR4_FIXED0,
- MSR_IA32_VMX_CR4_FIXED1,
- MSR_IA32_VMX_VMCS_ENUM,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- MSR_IA32_VMX_EPT_VPID_CAP,
- MSR_IA32_VMX_VMFUNC,
-
+static const u32 msr_based_features_all_except_vmx[] = {
MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
};
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
static unsigned int num_msr_based_features;
/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
+{
+ int i;
+
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+/*
* Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
* does not yet virtualize. These include:
* 10 - MISC_PACKAGE_CTRLS
@@ -2192,6 +2207,22 @@ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
+ u64 val;
+
+ /*
+ * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
+ * not support modifying the guest vCPU model on the fly, e.g. changing
+ * the nVMX capabilities while L2 is running is nonsensical. Ignore
+ * writes of the same value, e.g. to allow userspace to blindly stuff
+ * all MSRs when emulating RESET.
+ */
+ if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
+ if (do_get_msr(vcpu, index, &val) || *data != val)
+ return -EINVAL;
+
+ return 0;
+ }
+
return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
@@ -3614,9 +3645,40 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~kvm_caps.supported_perf_cap)
return 1;
+ /*
+ * Note, this is not just a performance optimization! KVM
+ * disallows changing feature MSRs after the vCPU has run; PMU
+ * refresh will bug the VM if called after the vCPU has run.
+ */
+ if (vcpu->arch.perf_capabilities == data)
+ break;
+
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
- return 0;
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ break;
+ case MSR_IA32_FLUSH_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ break;
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
@@ -4531,9 +4593,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = 0;
break;
case KVM_CAP_XSAVE2: {
- u64 guest_perm = xstate_get_guest_group_perm();
-
- r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
+ r = xstate_required_size(kvm_get_filtered_xcr0(), false);
if (r < sizeof(struct kvm_xsave))
r = sizeof(struct kvm_xsave);
break;
@@ -5033,7 +5093,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
- !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
+ !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
@@ -5125,7 +5185,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
- events->nmi.pending = vcpu->arch.nmi_pending != 0;
+ events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
/* events->sipi_vector is never valid when reporting to user space */
@@ -5212,8 +5272,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
- if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
- vcpu->arch.nmi_pending = events->nmi.pending;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
+ vcpu->arch.nmi_pending = 0;
+ atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+ }
static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
@@ -7009,6 +7072,18 @@ out:
return r;
}
+static void kvm_probe_feature_msr(u32 msr_index)
+{
+ struct kvm_msr_entry msr = {
+ .index = msr_index,
+ };
+
+ if (kvm_get_msr_feature(&msr))
+ return;
+
+ msr_based_features[num_msr_based_features++] = msr_index;
+}
+
static void kvm_probe_msr_to_save(u32 msr_index)
{
u32 dummy[2];
@@ -7084,7 +7159,7 @@ static void kvm_probe_msr_to_save(u32 msr_index)
msrs_to_save[num_msrs_to_save++] = msr_index;
}
-static void kvm_init_msr_list(void)
+static void kvm_init_msr_lists(void)
{
unsigned i;
@@ -7110,15 +7185,11 @@ static void kvm_init_msr_list(void)
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
}
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
- struct kvm_msr_entry msr;
-
- msr.index = msr_based_features_all[i];
- if (kvm_get_msr_feature(&msr))
- continue;
+ for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++)
+ kvm_probe_feature_msr(i);
- msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
- }
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
+ kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -9452,7 +9523,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_caps.max_guest_tsc_khz = max;
}
kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- kvm_init_msr_list();
+ kvm_init_msr_lists();
return 0;
out_unwind_ops:
@@ -9783,7 +9854,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = attrs;
- vcpu->run->hypercall.longmode = op_64_bit;
+ vcpu->run->hypercall.flags = 0;
+ if (op_64_bit)
+ vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
vcpu->arch.complete_userspace_io = complete_hypercall_exit;
return 0;
}
@@ -10138,19 +10213,46 @@ out:
static void process_nmi(struct kvm_vcpu *vcpu)
{
- unsigned limit = 2;
+ unsigned int limit;
/*
- * x86 is limited to one NMI running, and one NMI pending after it.
- * If an NMI is already in progress, limit further NMIs to just one.
- * Otherwise, allow two (and we'll inject the first one immediately).
+ * x86 is limited to one NMI pending, but because KVM can't react to
+ * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
+ * scheduled out, KVM needs to play nice with two queued NMIs showing
+ * up at the same time. To handle this scenario, allow two NMIs to be
+ * (temporarily) pending so long as NMIs are not blocked and KVM is not
+ * waiting for a previous NMI injection to complete (which effectively
+ * blocks NMIs). KVM will immediately inject one of the two NMIs, and
+ * will request an NMI window to handle the second NMI.
*/
if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
+ else
+ limit = 2;
+
+ /*
+ * Adjust the limit to account for pending virtual NMIs, which aren't
+ * tracked in vcpu->arch.nmi_pending.
+ */
+ if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+ limit--;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ if (vcpu->arch.nmi_pending &&
+ (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+ vcpu->arch.nmi_pending--;
+
+ if (vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+/* Return total number of NMIs pending injection to the VM */
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.nmi_pending +
+ static_call(kvm_x86_is_vnmi_pending)(vcpu);
}
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
@@ -13236,7 +13338,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return 1;
}
- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+ pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);
switch (type) {
case INVPCID_TYPE_INDIV_ADDR:
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a8167b47b8c8..c544602d07a3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -3,6 +3,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include <asm/fpu/xstate.h>
#include <asm/mce.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
@@ -40,6 +41,14 @@ void kvm_spurious_fault(void);
failed; \
})
+/*
+ * The first...last VMX feature MSRs that are emulated by KVM. This may or may
+ * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an
+ * associated feature that KVM supports for nested virtualization.
+ */
+#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC
+#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC
+
#define KVM_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
#define KVM_DEFAULT_PLE_WINDOW_GROW 2
@@ -83,6 +92,11 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.last_vmentry_cpu != -1;
+}
+
static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
{
return vcpu->arch.exception.pending ||
@@ -123,15 +137,15 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
static inline bool is_protmode(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+ return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE);
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
+static inline bool is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
- return vcpu->arch.efer & EFER_LMA;
+ return !!(vcpu->arch.efer & EFER_LMA);
#else
- return 0;
+ return false;
#endif
}
@@ -171,19 +185,19 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline int is_pae(struct kvm_vcpu *vcpu)
+static inline bool is_pae(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PAE);
}
-static inline int is_pse(struct kvm_vcpu *vcpu)
+static inline bool is_pse(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PSE);
}
-static inline int is_paging(struct kvm_vcpu *vcpu)
+static inline bool is_paging(struct kvm_vcpu *vcpu)
{
- return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
+ return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG));
}
static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
@@ -193,7 +207,7 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48;
}
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
@@ -315,6 +329,34 @@ extern struct kvm_caps kvm_caps;
extern bool enable_pmu;
+/*
+ * Get a filtered version of KVM's supported XCR0 that strips out dynamic
+ * features for which the current process doesn't (yet) have permission to use.
+ * This is intended to be used only when enumerating support to userspace,
+ * e.g. in KVM_GET_SUPPORTED_CPUID and KVM_CAP_XSAVE2, it does NOT need to be
+ * used to check/restrict guest behavior as KVM rejects KVM_SET_CPUID{2} if
+ * userspace attempts to enable unpermitted features.
+ */
+static inline u64 kvm_get_filtered_xcr0(void)
+{
+ u64 permitted_xcr0 = kvm_caps.supported_xcr0;
+
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ if (permitted_xcr0 & XFEATURE_MASK_USER_DYNAMIC) {
+ permitted_xcr0 &= xstate_get_guest_group_perm();
+
+ /*
+ * Treat XTILE_CFG as unsupported if the current process isn't
+ * allowed to use XTILE_DATA, as attempting to set XTILE_CFG in
+ * XCR0 without setting XTILE_DATA is architecturally illegal.
+ */
+ if (!(permitted_xcr0 & XFEATURE_MASK_XTILE_DATA))
+ permitted_xcr0 &= ~XFEATURE_MASK_XTILE_CFG;
+ }
+ return permitted_xcr0;
+}
+
static inline bool kvm_mpx_supported(void)
{
return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 7316a8224259..e91500a80963 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -10,6 +10,7 @@
#include <asm/fixmap.h>
#include <asm/desc.h>
#include <asm/kasan.h>
+#include <asm/setup.h>
static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
@@ -29,6 +30,12 @@ static __init void init_cea_offsets(void)
unsigned int max_cea;
unsigned int i, j;
+ if (!kaslr_enabled()) {
+ for_each_possible_cpu(i)
+ per_cpu(_cea_offset, i) = i;
+ return;
+ }
+
max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE;
/* O(sodding terrible) */
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 88cccd65029d..c6efcf559d88 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -600,7 +600,8 @@ void __init sme_enable(struct boot_params *bp)
cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
((u64)bp->ext_cmd_line_ptr << 32));
- cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+ if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
+ return;
if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
sme_me_mask = me_mask;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3c5b52fbe4a7..a9ec8c9f5c5d 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_PV_DOM0) += vga.o
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index bb59cc6ddb2d..093b78c8bbec 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1390,7 +1390,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
x86_platform.set_legacy_features =
xen_dom0_set_legacy_features;
- xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_init_vga(info, xen_start_info->console.dom0.info_size,
+ &boot_params.screen_info);
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index bcae606bbc5c..ada3868c02c2 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -43,6 +43,19 @@ void __init xen_pvh_init(struct boot_params *boot_params)
x86_init.oem.banner = xen_banner;
xen_efi_init(boot_params);
+
+ if (xen_initial_domain()) {
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_dom0_console,
+ };
+ int ret = HYPERVISOR_platform_op(&op);
+
+ if (ret > 0)
+ xen_init_vga(&op.u.dom0_console,
+ min(ret * sizeof(char),
+ sizeof(op.u.dom0_console)),
+ &boot_params->screen_info);
+ }
}
void __init mem_map_via_hcall(struct boot_params *boot_params_p)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 1d597364b49d..b74ac2562cfb 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <asm/xen/cpuid.h>
#include <xen/events.h>
#include <xen/features.h>
@@ -503,11 +504,7 @@ static int __init xen_tsc_safe_clocksource(void)
/* Leaf 4, sub-leaf 0 (0x40000x03) */
cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);
- /* tsc_mode = no_emulate (2) */
- if (ebx != 2)
- return 0;
-
- return 1;
+ return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
}
static void __init xen_time_init(void)
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index 14ea32e734d5..d97adab8420f 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -9,10 +9,9 @@
#include "xen-ops.h"
-void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size,
+ struct screen_info *screen_info)
{
- struct screen_info *screen_info = &boot_params.screen_info;
-
/* This is drawn from a dump from vgacon:startup in
* standard Linux. */
screen_info->orig_video_mode = 3;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a8bb972193d..a10903785a33 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -108,11 +108,12 @@ static inline void xen_uninit_lock_cpu(int cpu)
struct dom0_vga_console_info;
-#ifdef CONFIG_XEN_PV_DOM0
-void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size,
+ struct screen_info *);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
- size_t size)
+ size_t size, struct screen_info *si)
{
}
#endif