Diffstat (limited to 'arch/arm64')
198 files changed, 22298 insertions, 1604 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 40fb05d96c60..d0bc8bae7c8d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -9,6 +9,8 @@ config ARM64 select ACPI_MCFG if (ACPI && PCI) select ACPI_SPCR_TABLE if ACPI select ACPI_PPTT if ACPI + select ARCH_HAS_DEBUG_WX + select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_PREP_COHERENT @@ -20,6 +22,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS @@ -32,6 +35,7 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION @@ -61,9 +65,12 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_GNU_PROPERTY select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_USE_SYM_ANNOTATIONS select ARCH_SUPPORTS_MEMORY_FAILURE + select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) select ARCH_SUPPORTS_NUMA_BALANCING @@ -156,7 +163,6 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING - select HAVE_MEMBLOCK_NODE_MAP if NUMA select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS @@ -524,13 +530,13 @@ config ARM64_ERRATUM_1418040 If unsure, say Y. -config ARM64_WORKAROUND_SPECULATIVE_AT_VHE +config ARM64_WORKAROUND_SPECULATIVE_AT bool config ARM64_ERRATUM_1165522 - bool "Cortex-A76: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" + bool "Cortex-A76: 1165522: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" default y - select ARM64_WORKAROUND_SPECULATIVE_AT_VHE + select ARM64_WORKAROUND_SPECULATIVE_AT help This option adds a workaround for ARM Cortex-A76 erratum 1165522. @@ -540,10 +546,23 @@ config ARM64_ERRATUM_1165522 If unsure, say Y. +config ARM64_ERRATUM_1319367 + bool "Cortex-A57/A72: 1319537: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" + default y + select ARM64_WORKAROUND_SPECULATIVE_AT + help + This option adds work arounds for ARM Cortex-A57 erratum 1319537 + and A72 erratum 1319367 + + Cortex-A57 and A72 cores could end-up with corrupted TLBs by + speculating an AT instruction during a guest context switch. + + If unsure, say Y. + config ARM64_ERRATUM_1530923 - bool "Cortex-A55: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" + bool "Cortex-A55: 1530923: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" default y - select ARM64_WORKAROUND_SPECULATIVE_AT_VHE + select ARM64_WORKAROUND_SPECULATIVE_AT help This option adds a workaround for ARM Cortex-A55 erratum 1530923. 
@@ -553,6 +572,9 @@ config ARM64_ERRATUM_1530923 If unsure, say Y. +config ARM64_WORKAROUND_REPEAT_TLBI + bool + config ARM64_ERRATUM_1286807 bool "Cortex-A76: Modification of the translation table for a virtual address might lead to read-after-read ordering violation" default y @@ -569,22 +591,6 @@ config ARM64_ERRATUM_1286807 invalidated has been observed by other observers. The workaround repeats the TLBI+DSB operation. -config ARM64_WORKAROUND_SPECULATIVE_AT_NVHE - bool - -config ARM64_ERRATUM_1319367 - bool "Cortex-A57/A72: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" - default y - select ARM64_WORKAROUND_SPECULATIVE_AT_NVHE - help - This option adds work arounds for ARM Cortex-A57 erratum 1319537 - and A72 erratum 1319367 - - Cortex-A57 and A72 cores could end-up with corrupted TLBs by - speculating an AT instruction during a guest context switch. - - If unsure, say Y. - config ARM64_ERRATUM_1463225 bool "Cortex-A76: Software Step might prevent interrupt recognition" default y @@ -694,6 +700,35 @@ config CAVIUM_TX2_ERRATUM_219 If unsure, say Y. +config FUJITSU_ERRATUM_010001 + bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly" + default y + help + This option adds a workaround for Fujitsu-A64FX erratum E#010001. + On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory + accesses may cause undefined fault (Data abort, DFSC=0b111111). + This fault occurs under a specific hardware condition when a + load/store instruction performs an address translation using: + case-1 TTBR0_EL1 with TCR_EL1.NFD0 == 1. + case-2 TTBR0_EL2 with TCR_EL2.NFD0 == 1. + case-3 TTBR1_EL1 with TCR_EL1.NFD1 == 1. + case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1. + + The workaround is to ensure these bits are clear in TCR_ELx. + The workaround only affects the Fujitsu-A64FX. + + If unsure, say Y. + +config HISILICON_ERRATUM_161600802 + bool "Hip07 161600802: Erroneous redistributor VLPI base" + default y + help + The HiSilicon Hip07 SoC uses the wrong redistributor base + when issued ITS commands such as VMOVP and VMAPP, and requires + a 128kB offset to be applied to the target address in this commands. + + If unsure, say Y. + config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y @@ -705,9 +740,6 @@ config QCOM_FALKOR_ERRATUM_1003 is unchanged. Work around the erratum by invalidating the walk cache entries for the trampoline before entering the kernel proper. -config ARM64_WORKAROUND_REPEAT_TLBI - bool - config QCOM_FALKOR_ERRATUM_1009 bool "Falkor E1009: Prematurely complete a DSB after a TLBI" default y @@ -729,25 +761,6 @@ config QCOM_QDF2400_ERRATUM_0065 If unsure, say Y. -config SOCIONEXT_SYNQUACER_PREITS - bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" - default y - help - Socionext Synquacer SoCs implement a separate h/w block to generate - MSI doorbell writes with non-zero values for the device ID. - - If unsure, say Y. - -config HISILICON_ERRATUM_161600802 - bool "Hip07 161600802: Erroneous redistributor VLPI base" - default y - help - The HiSilicon Hip07 SoC uses the wrong redistributor base - when issued ITS commands such as VMOVP and VMAPP, and requires - a 128kB offset to be applied to the target address in this commands. - - If unsure, say Y. 
- config QCOM_FALKOR_ERRATUM_E1041 bool "Falkor E1041: Speculative instruction fetches might cause errant memory access" default y @@ -758,22 +771,12 @@ config QCOM_FALKOR_ERRATUM_E1041 If unsure, say Y. -config FUJITSU_ERRATUM_010001 - bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly" +config SOCIONEXT_SYNQUACER_PREITS + bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y help - This option adds a workaround for Fujitsu-A64FX erratum E#010001. - On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory - accesses may cause undefined fault (Data abort, DFSC=0b111111). - This fault occurs under a specific hardware condition when a - load/store instruction performs an address translation using: - case-1 TTBR0_EL1 with TCR_EL1.NFD0 == 1. - case-2 TTBR0_EL2 with TCR_EL2.NFD0 == 1. - case-3 TTBR1_EL1 with TCR_EL1.NFD1 == 1. - case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1. - - The workaround is to ensure these bits are clear in TCR_ELx. - The workaround only affects the Fujitsu-A64FX. + Socionext Synquacer SoCs implement a separate h/w block to generate + MSI doorbell writes with non-zero values for the device ID. If unsure, say Y. @@ -1025,6 +1028,10 @@ config ARCH_HAS_CACHE_LINE_SIZE config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y if PGTABLE_LEVELS > 2 +# Supported by clang >= 7.0 +config CC_HAVE_SHADOW_CALL_STACK + def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" ---help--- @@ -1584,6 +1591,48 @@ endmenu menu "ARMv8.5 architectural features" +config ARM64_BTI + bool "Branch Target Identification support" + default y + help + Branch Target Identification (part of the ARMv8.5 Extensions) + provides a mechanism to limit the set of locations to which computed + branch instructions such as BR or BLR can jump. + + To make use of BTI on CPUs that support it, say Y. + + BTI is intended to provide complementary protection to other control + flow integrity protection mechanisms, such as the Pointer + authentication mechanism provided as part of the ARMv8.3 Extensions. + For this reason, it does not make sense to enable this option without + also enabling support for pointer authentication. Thus, when + enabling this option you should also select ARM64_PTR_AUTH=y. + + Userspace binaries must also be specifically compiled to make use of + this mechanism. If you say N here or the hardware does not support + BTI, such binaries can still run, but you get no additional + enforcement of branch destinations. + +config ARM64_BTI_KERNEL + bool "Use Branch Target Identification for kernel" + default y + depends on ARM64_BTI + depends on ARM64_PTR_AUTH + depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697 + depends on !CC_IS_GCC || GCC_VERSION >= 100100 + depends on !(CC_IS_CLANG && GCOV_KERNEL) + depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) + help + Build the kernel with Branch Target Identification annotations + and enable enforcement of this for kernel code. When this option + is enabled and the system supports BTI all kernel code including + modular code must have BTI enabled. 
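As a minimal sketch of what the help text above implies for hand-written code (illustrative only, not taken from this series), a routine that may be reached through an indirect branch needs a BTI landing pad as its first instruction once it runs with BTI enforcement:

/*
 * Minimal sketch, assuming GNU C and an assembler that accepts the
 * armv8.5-a "bti" mnemonic (older assemblers spell it "hint #34").
 * BTI instructions live in the HINT space, so this stays a NOP on
 * CPUs without the Branch Target Identification extension.
 */
int add_one(int x);

asm(
"	.text\n"
"	.global	add_one\n"
"	.type	add_one, %function\n"
"add_one:\n"
"	bti	c\n"		/* landing pad: valid target for call-type indirect branches */
"	add	w0, w0, #1\n"	/* AAPCS64: int argument and return value both in w0 */
"	ret\n"
);

Compilers emit these landing pads automatically when passed -mbranch-protection=pac-ret+leaf+bti, which is what the CC_HAS_BRANCH_PROT_PAC_RET_BTI probe below tests for.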
+ +config CC_HAS_BRANCH_PROT_PAC_RET_BTI + # GCC 9 or later, clang 8 or later + def_bool $(cc-option,-mbranch-protection=pac-ret+leaf+bti) + config ARM64_E0PD bool "Enable support for E0PD" default y @@ -1785,7 +1834,7 @@ config EFI select EFI_PARAMS_FROM_FDT select EFI_RUNTIME_WRAPPERS select EFI_STUB - select EFI_ARMSTUB + select EFI_GENERIC_STUB default y help This option provides support for runtime services provided diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index a1efa246c9ed..cdf7ec0b975e 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -23,35 +23,6 @@ config ARM64_RANDOMIZE_TEXT_OFFSET of TEXT_OFFSET and platforms must not require a specific value. -config DEBUG_WX - bool "Warn on W+X mappings at boot" - select PTDUMP_CORE - ---help--- - Generate a warning if any W+X mappings are found at boot. - - This is useful for discovering cases where the kernel is leaving - W+X mappings after applying NX, as such mappings are a security risk. - This check also includes UXN, which should be set on all kernel - mappings. - - Look for a message in dmesg output like this: - - arm64/mm: Checked W+X mappings: passed, no W+X pages found. - - or like this, if the check failed: - - arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found. - - Note that even if the check fails, your kernel is possibly - still fine, as W+X mappings are not a security hole in - themselves, what they do is that they make the exploitation - of other unfixed kernel bugs easier. - - There is no runtime or memory usage effect of this option - once the kernel has booted up - it's a one time check. - - If in doubt, say "Y". - config DEBUG_EFI depends on EFI && DEBUG_INFO bool "UEFI debugging" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 85e4149cc5d5..650e1185c190 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -12,7 +12,6 @@ LDFLAGS_vmlinux :=--no-undefined -X CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) -GZFLAGS :=-9 ifeq ($(CONFIG_RELOCATABLE), y) # Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour @@ -71,7 +70,14 @@ branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) ifeq ($(CONFIG_ARM64_PTR_AUTH),y) branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all +# We enable additional protection for leaf functions as there is some +# narrow potential for ROP protection benefits and no substantial +# performance impact has been observed. +ifeq ($(CONFIG_ARM64_BTI_KERNEL),y) +branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET_BTI) := -mbranch-protection=pac-ret+leaf+bti +else branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf +endif # -march=armv8.3-a enables the non-nops instructions for PAC, to avoid the # compiler to generate them and consequently to break the single image contract # we pass it only to the assembler. 
This option is utilized only in case of non @@ -81,6 +87,10 @@ endif KBUILD_CFLAGS += $(branch-prot-flags-y) +ifeq ($(CONFIG_SHADOW_CALL_STACK), y) +KBUILD_CFLAGS += -ffixed-x18 +endif + ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__AARCH64EB__ @@ -118,7 +128,7 @@ TEXT_OFFSET := $(shell awk "BEGIN {srand(); printf \"0x%06x\n\", \ int(2 * 1024 * 1024 / (2 ^ $(CONFIG_ARM64_PAGE_SHIFT)) * \ rand()) * (2 ^ $(CONFIG_ARM64_PAGE_SHIFT))}") else -TEXT_OFFSET := 0x00080000 +TEXT_OFFSET := 0x0 endif ifeq ($(CONFIG_KASAN_SW_TAGS), y) @@ -131,7 +141,7 @@ KBUILD_CFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_CPPFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_AFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) -export TEXT_OFFSET GZFLAGS +export TEXT_OFFSET core-y += arch/arm64/ libs-y := arch/arm64/lib/ $(libs-y) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts index 316e8a443913..dc4ab6b434f9 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts @@ -98,7 +98,7 @@ }; &codec_analog { - hpvcc-supply = <®_eldo1>; + cpvdd-supply = <®_eldo1>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi index 31143fe64d91..c26cc1fcaffd 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi @@ -154,24 +154,6 @@ }; }; - sound_spdif { - compatible = "simple-audio-card"; - simple-audio-card,name = "On-board SPDIF"; - - simple-audio-card,cpu { - sound-dai = <&spdif>; - }; - - simple-audio-card,codec { - sound-dai = <&spdif_out>; - }; - }; - - spdif_out: spdif-out { - #sound-dai-cells = <0>; - compatible = "linux,spdif-dit"; - }; - timer { compatible = "arm,armv8-timer"; allwinner,erratum-unknown1; diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index 0882ea215b88..c0aef7d69117 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -2319,7 +2319,7 @@ reg = <0x0 0xff400000 0x0 0x40000>; interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clkc CLKID_USB1_DDR_BRIDGE>; - clock-names = "ddr"; + clock-names = "otg"; phys = <&usb2_phy1>; phy-names = "usb2-phy"; dr_mode = "peripheral"; diff --git a/arch/arm64/boot/dts/amlogic/meson-g12.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12.dtsi index 783e5a397f86..55d39020ec72 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12.dtsi @@ -1,4 +1,3 @@ - // SPDX-License-Identifier: (GPL-2.0+ OR MIT) /* * Copyright (c) 2019 BayLibre, SAS diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi index c33e85fbdaba..c6c8caed8327 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi @@ -154,6 +154,10 @@ clock-latency = <50000>; }; +&frddr_a { + status = "okay"; +}; + &frddr_b { status = "okay"; }; diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts b/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts index 325e448eb09c..06c5430eb92d 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts +++ b/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts @@ -545,7 +545,7 @@ &usb { status 
= "okay"; dr_mode = "host"; - vbus-regulator = <&usb_pwr_en>; + vbus-supply = <&usb_pwr_en>; }; &usb2_phy0 { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 2a7f70b71149..13d0570c7ed6 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -447,7 +447,7 @@ edma0: dma-controller@22c0000 { #dma-cells = <2>; - compatible = "fsl,ls1028a-edma"; + compatible = "fsl,ls1028a-edma", "fsl,vf610-edma"; reg = <0x0 0x22c0000 0x0 0x10000>, <0x0 0x22d0000 0x0 0x10000>, <0x0 0x22e0000 0x0 0x10000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi index cc7152ecedd9..8829628f757a 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi @@ -264,7 +264,7 @@ aips1: bus@30000000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x301f0000 0x10000>; + reg = <0x30000000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x30000000 0x30000000 0x400000>; @@ -543,7 +543,7 @@ aips2: bus@30400000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x305f0000 0x10000>; + reg = <0x30400000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x30400000 0x30400000 0x400000>; @@ -603,7 +603,7 @@ aips3: bus@30800000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x309f0000 0x10000>; + reg = <0x30800000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x30800000 0x30800000 0x400000>, @@ -863,7 +863,7 @@ aips4: bus@32c00000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x32df0000 0x10000>; + reg = <0x32c00000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x32c00000 0x32c00000 0x400000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mn.dtsi b/arch/arm64/boot/dts/freescale/imx8mn.dtsi index fa78f0163270..43971abe218b 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn.dtsi @@ -241,7 +241,7 @@ aips1: bus@30000000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x301f0000 0x10000>; + reg = <0x30000000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges; @@ -448,7 +448,7 @@ aips2: bus@30400000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x305f0000 0x10000>; + reg = <0x30400000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges; @@ -508,7 +508,7 @@ aips3: bus@30800000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x309f0000 0x10000>; + reg = <0x30800000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges; @@ -718,7 +718,7 @@ reg = <0x30bd0000 0x10000>; interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clk IMX8MN_CLK_SDMA1_ROOT>, - <&clk IMX8MN_CLK_SDMA1_ROOT>; + <&clk IMX8MN_CLK_AHB>; clock-names = "ipg", "ahb"; #dma-cells = <3>; fsl,sdma-ram-script-name = "imx/sdma/sdma-imx7d.bin"; @@ -754,7 +754,7 @@ aips4: bus@32c00000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x32df0000 0x10000>; + reg = <0x32c00000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h index da78f89b6c98..319ab34cab3e 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h +++ b/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h @@ -151,26 +151,26 @@ #define MX8MP_IOMUXC_ENET_TXC__SIM_M_HADDR22 0x070 0x2D0 0x000 0x7 0x0 #define MX8MP_IOMUXC_ENET_RX_CTL__ENET_QOS_RGMII_RX_CTL 0x074 0x2D4 0x000 0x0 0x0 #define 
MX8MP_IOMUXC_ENET_RX_CTL__AUDIOMIX_SAI7_TX_SYNC 0x074 0x2D4 0x540 0x2 0x0 -#define MX8MP_IOMUXC_ENET_RX_CTL__AUDIOMIX_BIT_STREAM03 0x074 0x2D4 0x4CC 0x3 0x0 +#define MX8MP_IOMUXC_ENET_RX_CTL__AUDIOMIX_BIT_STREAM03 0x074 0x2D4 0x4CC 0x3 0x1 #define MX8MP_IOMUXC_ENET_RX_CTL__GPIO1_IO24 0x074 0x2D4 0x000 0x5 0x0 #define MX8MP_IOMUXC_ENET_RX_CTL__USDHC3_DATA2 0x074 0x2D4 0x618 0x6 0x0 #define MX8MP_IOMUXC_ENET_RX_CTL__SIM_M_HADDR23 0x074 0x2D4 0x000 0x7 0x0 #define MX8MP_IOMUXC_ENET_RXC__CCM_ENET_QOS_CLOCK_GENERATE_RX_CLK 0x078 0x2D8 0x000 0x0 0x0 #define MX8MP_IOMUXC_ENET_RXC__ENET_QOS_RX_ER 0x078 0x2D8 0x000 0x1 0x0 #define MX8MP_IOMUXC_ENET_RXC__AUDIOMIX_SAI7_TX_BCLK 0x078 0x2D8 0x53C 0x2 0x0 -#define MX8MP_IOMUXC_ENET_RXC__AUDIOMIX_BIT_STREAM02 0x078 0x2D8 0x4C8 0x3 0x0 +#define MX8MP_IOMUXC_ENET_RXC__AUDIOMIX_BIT_STREAM02 0x078 0x2D8 0x4C8 0x3 0x1 #define MX8MP_IOMUXC_ENET_RXC__GPIO1_IO25 0x078 0x2D8 0x000 0x5 0x0 #define MX8MP_IOMUXC_ENET_RXC__USDHC3_DATA3 0x078 0x2D8 0x61C 0x6 0x0 #define MX8MP_IOMUXC_ENET_RXC__SIM_M_HADDR24 0x078 0x2D8 0x000 0x7 0x0 #define MX8MP_IOMUXC_ENET_RD0__ENET_QOS_RGMII_RD0 0x07C 0x2DC 0x000 0x0 0x0 #define MX8MP_IOMUXC_ENET_RD0__AUDIOMIX_SAI7_RX_DATA00 0x07C 0x2DC 0x534 0x2 0x0 -#define MX8MP_IOMUXC_ENET_RD0__AUDIOMIX_BIT_STREAM01 0x07C 0x2DC 0x4C4 0x3 0x0 +#define MX8MP_IOMUXC_ENET_RD0__AUDIOMIX_BIT_STREAM01 0x07C 0x2DC 0x4C4 0x3 0x1 #define MX8MP_IOMUXC_ENET_RD0__GPIO1_IO26 0x07C 0x2DC 0x000 0x5 0x0 #define MX8MP_IOMUXC_ENET_RD0__USDHC3_DATA4 0x07C 0x2DC 0x620 0x6 0x0 #define MX8MP_IOMUXC_ENET_RD0__SIM_M_HADDR25 0x07C 0x2DC 0x000 0x7 0x0 #define MX8MP_IOMUXC_ENET_RD1__ENET_QOS_RGMII_RD1 0x080 0x2E0 0x000 0x0 0x0 #define MX8MP_IOMUXC_ENET_RD1__AUDIOMIX_SAI7_RX_SYNC 0x080 0x2E0 0x538 0x2 0x0 -#define MX8MP_IOMUXC_ENET_RD1__AUDIOMIX_BIT_STREAM00 0x080 0x2E0 0x4C0 0x3 0x0 +#define MX8MP_IOMUXC_ENET_RD1__AUDIOMIX_BIT_STREAM00 0x080 0x2E0 0x4C0 0x3 0x1 #define MX8MP_IOMUXC_ENET_RD1__GPIO1_IO27 0x080 0x2E0 0x000 0x5 0x0 #define MX8MP_IOMUXC_ENET_RD1__USDHC3_RESET_B 0x080 0x2E0 0x000 0x6 0x0 #define MX8MP_IOMUXC_ENET_RD1__SIM_M_HADDR26 0x080 0x2E0 0x000 0x7 0x0 @@ -291,7 +291,7 @@ #define MX8MP_IOMUXC_SD2_DATA0__I2C4_SDA 0x0C8 0x328 0x5C0 0x2 0x1 #define MX8MP_IOMUXC_SD2_DATA0__UART2_DCE_RX 0x0C8 0x328 0x5F0 0x3 0x2 #define MX8MP_IOMUXC_SD2_DATA0__UART2_DTE_TX 0x0C8 0x328 0x000 0x3 0x0 -#define MX8MP_IOMUXC_SD2_DATA0__AUDIOMIX_BIT_STREAM00 0x0C8 0x328 0x4C0 0x4 0x1 +#define MX8MP_IOMUXC_SD2_DATA0__AUDIOMIX_BIT_STREAM00 0x0C8 0x328 0x4C0 0x4 0x2 #define MX8MP_IOMUXC_SD2_DATA0__GPIO2_IO15 0x0C8 0x328 0x000 0x5 0x0 #define MX8MP_IOMUXC_SD2_DATA0__CCMSRCGPCMIX_OBSERVE2 0x0C8 0x328 0x000 0x6 0x0 #define MX8MP_IOMUXC_SD2_DATA0__OBSERVE_MUX_OUT02 0x0C8 0x328 0x000 0x7 0x0 @@ -313,7 +313,7 @@ #define MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x0D4 0x334 0x000 0x0 0x0 #define MX8MP_IOMUXC_SD2_DATA3__ECSPI2_MISO 0x0D4 0x334 0x56C 0x2 0x0 #define MX8MP_IOMUXC_SD2_DATA3__AUDIOMIX_SPDIF_IN 0x0D4 0x334 0x544 0x3 0x1 -#define MX8MP_IOMUXC_SD2_DATA3__AUDIOMIX_BIT_STREAM03 0x0D4 0x334 0x4CC 0x4 0x1 +#define MX8MP_IOMUXC_SD2_DATA3__AUDIOMIX_BIT_STREAM03 0x0D4 0x334 0x4CC 0x4 0x2 #define MX8MP_IOMUXC_SD2_DATA3__GPIO2_IO18 0x0D4 0x334 0x000 0x5 0x0 #define MX8MP_IOMUXC_SD2_DATA3__CCMSRCGPCMIX_EARLY_RESET 0x0D4 0x334 0x000 0x6 0x0 #define MX8MP_IOMUXC_SD2_RESET_B__USDHC2_RESET_B 0x0D8 0x338 0x000 0x0 0x0 @@ -487,27 +487,27 @@ #define MX8MP_IOMUXC_SAI5_RXD0__AUDIOMIX_SAI1_TX_DATA02 0x134 0x394 0x000 0x1 0x0 #define MX8MP_IOMUXC_SAI5_RXD0__PWM2_OUT 0x134 0x394 0x000 0x2 0x0 #define 
MX8MP_IOMUXC_SAI5_RXD0__I2C5_SCL 0x134 0x394 0x5C4 0x3 0x1 -#define MX8MP_IOMUXC_SAI5_RXD0__AUDIOMIX_BIT_STREAM00 0x134 0x394 0x4C0 0x4 0x2 +#define MX8MP_IOMUXC_SAI5_RXD0__AUDIOMIX_BIT_STREAM00 0x134 0x394 0x4C0 0x4 0x3 #define MX8MP_IOMUXC_SAI5_RXD0__GPIO3_IO21 0x134 0x394 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI5_RX_DATA01 0x138 0x398 0x4FC 0x0 0x0 #define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI1_TX_DATA03 0x138 0x398 0x000 0x1 0x0 #define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI1_TX_SYNC 0x138 0x398 0x4D8 0x2 0x0 #define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI5_TX_SYNC 0x138 0x398 0x510 0x3 0x0 -#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_BIT_STREAM01 0x138 0x398 0x4C4 0x4 0x2 +#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_BIT_STREAM01 0x138 0x398 0x4C4 0x4 0x3 #define MX8MP_IOMUXC_SAI5_RXD1__GPIO3_IO22 0x138 0x398 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI5_RXD1__CAN1_TX 0x138 0x398 0x000 0x6 0x0 #define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI5_RX_DATA02 0x13C 0x39C 0x500 0x0 0x0 #define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI1_TX_DATA04 0x13C 0x39C 0x000 0x1 0x0 #define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI1_TX_SYNC 0x13C 0x39C 0x4D8 0x2 0x1 #define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI5_TX_BCLK 0x13C 0x39C 0x50C 0x3 0x0 -#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_BIT_STREAM02 0x13C 0x39C 0x4C8 0x4 0x2 +#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_BIT_STREAM02 0x13C 0x39C 0x4C8 0x4 0x3 #define MX8MP_IOMUXC_SAI5_RXD2__GPIO3_IO23 0x13C 0x39C 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI5_RXD2__CAN1_RX 0x13C 0x39C 0x54C 0x6 0x0 #define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI5_RX_DATA03 0x140 0x3A0 0x504 0x0 0x0 #define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI1_TX_DATA05 0x140 0x3A0 0x000 0x1 0x0 #define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI1_TX_SYNC 0x140 0x3A0 0x4D8 0x2 0x2 #define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI5_TX_DATA00 0x140 0x3A0 0x000 0x3 0x0 -#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_BIT_STREAM03 0x140 0x3A0 0x4CC 0x4 0x2 +#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_BIT_STREAM03 0x140 0x3A0 0x4CC 0x4 0x3 #define MX8MP_IOMUXC_SAI5_RXD3__GPIO3_IO24 0x140 0x3A0 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI5_RXD3__CAN2_TX 0x140 0x3A0 0x000 0x6 0x0 #define MX8MP_IOMUXC_SAI5_MCLK__AUDIOMIX_SAI5_MCLK 0x144 0x3A4 0x4F0 0x0 0x0 @@ -528,22 +528,22 @@ #define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_SAI1_RX_DATA00 0x150 0x3B0 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_SAI5_RX_DATA00 0x150 0x3B0 0x4F8 0x1 0x1 #define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_SAI1_TX_DATA01 0x150 0x3B0 0x000 0x2 0x0 -#define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_BIT_STREAM00 0x150 0x3B0 0x4C0 0x3 0x3 +#define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_BIT_STREAM00 0x150 0x3B0 0x4C0 0x3 0x4 #define MX8MP_IOMUXC_SAI1_RXD0__ENET1_1588_EVENT1_IN 0x150 0x3B0 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI1_RXD0__GPIO4_IO02 0x150 0x3B0 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_SAI1_RX_DATA01 0x154 0x3B4 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_SAI5_RX_DATA01 0x154 0x3B4 0x4FC 0x1 0x1 -#define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_BIT_STREAM01 0x154 0x3B4 0x4C4 0x3 0x3 +#define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_BIT_STREAM01 0x154 0x3B4 0x4C4 0x3 0x4 #define MX8MP_IOMUXC_SAI1_RXD1__ENET1_1588_EVENT1_OUT 0x154 0x3B4 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI1_RXD1__GPIO4_IO03 0x154 0x3B4 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_SAI1_RX_DATA02 0x158 0x3B8 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_SAI5_RX_DATA02 0x158 0x3B8 0x500 0x1 0x1 -#define MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_BIT_STREAM02 0x158 0x3B8 0x4C8 0x3 0x3 +#define 
MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_BIT_STREAM02 0x158 0x3B8 0x4C8 0x3 0x4 #define MX8MP_IOMUXC_SAI1_RXD2__ENET1_MDC 0x158 0x3B8 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI1_RXD2__GPIO4_IO04 0x158 0x3B8 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_SAI1_RX_DATA03 0x15C 0x3BC 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_SAI5_RX_DATA03 0x15C 0x3BC 0x504 0x1 0x1 -#define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_BIT_STREAM03 0x15C 0x3BC 0x4CC 0x3 0x3 +#define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_BIT_STREAM03 0x15C 0x3BC 0x4CC 0x3 0x4 #define MX8MP_IOMUXC_SAI1_RXD3__ENET1_MDIO 0x15C 0x3BC 0x57C 0x4 0x1 #define MX8MP_IOMUXC_SAI1_RXD3__GPIO4_IO05 0x15C 0x3BC 0x000 0x5 0x0 #define MX8MP_IOMUXC_SAI1_RXD4__AUDIOMIX_SAI1_RX_DATA04 0x160 0x3C0 0x000 0x0 0x0 @@ -624,7 +624,7 @@ #define MX8MP_IOMUXC_SAI2_RXFS__UART1_DCE_TX 0x19C 0x3FC 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI2_RXFS__UART1_DTE_RX 0x19C 0x3FC 0x5E8 0x4 0x2 #define MX8MP_IOMUXC_SAI2_RXFS__GPIO4_IO21 0x19C 0x3FC 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI2_RXFS__AUDIOMIX_BIT_STREAM02 0x19C 0x3FC 0x4C8 0x6 0x4 +#define MX8MP_IOMUXC_SAI2_RXFS__AUDIOMIX_BIT_STREAM02 0x19C 0x3FC 0x4C8 0x6 0x5 #define MX8MP_IOMUXC_SAI2_RXFS__SIM_M_HSIZE00 0x19C 0x3FC 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_SAI2_RX_BCLK 0x1A0 0x400 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_SAI5_TX_BCLK 0x1A0 0x400 0x50C 0x1 0x2 @@ -632,7 +632,7 @@ #define MX8MP_IOMUXC_SAI2_RXC__UART1_DCE_RX 0x1A0 0x400 0x5E8 0x4 0x3 #define MX8MP_IOMUXC_SAI2_RXC__UART1_DTE_TX 0x1A0 0x400 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI2_RXC__GPIO4_IO22 0x1A0 0x400 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_BIT_STREAM01 0x1A0 0x400 0x4C4 0x6 0x4 +#define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_BIT_STREAM01 0x1A0 0x400 0x4C4 0x6 0x5 #define MX8MP_IOMUXC_SAI2_RXC__SIM_M_HSIZE01 0x1A0 0x400 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_SAI2_RX_DATA00 0x1A4 0x404 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_SAI5_TX_DATA00 0x1A4 0x404 0x000 0x1 0x0 @@ -641,7 +641,7 @@ #define MX8MP_IOMUXC_SAI2_RXD0__UART1_DCE_RTS 0x1A4 0x404 0x5E4 0x4 0x2 #define MX8MP_IOMUXC_SAI2_RXD0__UART1_DTE_CTS 0x1A4 0x404 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI2_RXD0__GPIO4_IO23 0x1A4 0x404 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_BIT_STREAM03 0x1A4 0x404 0x4CC 0x6 0x4 +#define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_BIT_STREAM03 0x1A4 0x404 0x4CC 0x6 0x5 #define MX8MP_IOMUXC_SAI2_RXD0__SIM_M_HSIZE02 0x1A4 0x404 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_SAI2_TX_SYNC 0x1A8 0x408 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_SAI5_TX_DATA01 0x1A8 0x408 0x000 0x1 0x0 @@ -650,13 +650,13 @@ #define MX8MP_IOMUXC_SAI2_TXFS__UART1_DCE_CTS 0x1A8 0x408 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI2_TXFS__UART1_DTE_RTS 0x1A8 0x408 0x5E4 0x4 0x3 #define MX8MP_IOMUXC_SAI2_TXFS__GPIO4_IO24 0x1A8 0x408 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_BIT_STREAM02 0x1A8 0x408 0x4C8 0x6 0x5 +#define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_BIT_STREAM02 0x1A8 0x408 0x4C8 0x6 0x6 #define MX8MP_IOMUXC_SAI2_TXFS__SIM_M_HWRITE 0x1A8 0x408 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_SAI2_TX_BCLK 0x1AC 0x40C 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_SAI5_TX_DATA02 0x1AC 0x40C 0x000 0x1 0x0 #define MX8MP_IOMUXC_SAI2_TXC__CAN1_RX 0x1AC 0x40C 0x54C 0x3 0x1 #define MX8MP_IOMUXC_SAI2_TXC__GPIO4_IO25 0x1AC 0x40C 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_BIT_STREAM01 0x1AC 0x40C 0x4C4 0x6 0x5 +#define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_BIT_STREAM01 0x1AC 0x40C 0x4C4 0x6 0x6 
#define MX8MP_IOMUXC_SAI2_TXC__SIM_M_HREADYOUT 0x1AC 0x40C 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI2_TXD0__AUDIOMIX_SAI2_TX_DATA00 0x1B0 0x410 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI2_TXD0__AUDIOMIX_SAI5_TX_DATA03 0x1B0 0x410 0x000 0x1 0x0 @@ -680,7 +680,7 @@ #define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_SAI3_RX_DATA01 0x1B8 0x418 0x000 0x3 0x0 #define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_SPDIF_IN 0x1B8 0x418 0x544 0x4 0x2 #define MX8MP_IOMUXC_SAI3_RXFS__GPIO4_IO28 0x1B8 0x418 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_BIT_STREAM00 0x1B8 0x418 0x4C0 0x6 0x4 +#define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_BIT_STREAM00 0x1B8 0x418 0x4C0 0x6 0x5 #define MX8MP_IOMUXC_SAI3_RXFS__TPSMP_HTRANS00 0x1B8 0x418 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI3_RXC__AUDIOMIX_SAI3_RX_BCLK 0x1BC 0x41C 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI3_RXC__AUDIOMIX_SAI2_RX_DATA02 0x1BC 0x41C 0x000 0x1 0x0 @@ -697,7 +697,7 @@ #define MX8MP_IOMUXC_SAI3_RXD__UART2_DCE_RTS 0x1C0 0x420 0x5EC 0x4 0x3 #define MX8MP_IOMUXC_SAI3_RXD__UART2_DTE_CTS 0x1C0 0x420 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI3_RXD__GPIO4_IO30 0x1C0 0x420 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI3_RXD__AUDIOMIX_BIT_STREAM01 0x1C0 0x420 0x4C4 0x6 0x6 +#define MX8MP_IOMUXC_SAI3_RXD__AUDIOMIX_BIT_STREAM01 0x1C0 0x420 0x4C4 0x6 0x7 #define MX8MP_IOMUXC_SAI3_RXD__TPSMP_HDATA00 0x1C0 0x420 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_SAI3_TX_SYNC 0x1C4 0x424 0x4EC 0x0 0x1 #define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_SAI2_TX_DATA01 0x1C4 0x424 0x000 0x1 0x0 @@ -706,7 +706,7 @@ #define MX8MP_IOMUXC_SAI3_TXFS__UART2_DCE_RX 0x1C4 0x424 0x5F0 0x4 0x4 #define MX8MP_IOMUXC_SAI3_TXFS__UART2_DTE_TX 0x1C4 0x424 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI3_TXFS__GPIO4_IO31 0x1C4 0x424 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_BIT_STREAM03 0x1C4 0x424 0x4CC 0x6 0x5 +#define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_BIT_STREAM03 0x1C4 0x424 0x4CC 0x6 0x6 #define MX8MP_IOMUXC_SAI3_TXFS__TPSMP_HDATA01 0x1C4 0x424 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_SAI3_TX_BCLK 0x1C8 0x428 0x4E8 0x0 0x1 #define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_SAI2_TX_DATA02 0x1C8 0x428 0x000 0x1 0x0 @@ -715,7 +715,7 @@ #define MX8MP_IOMUXC_SAI3_TXC__UART2_DCE_TX 0x1C8 0x428 0x000 0x4 0x0 #define MX8MP_IOMUXC_SAI3_TXC__UART2_DTE_RX 0x1C8 0x428 0x5F0 0x4 0x5 #define MX8MP_IOMUXC_SAI3_TXC__GPIO5_IO00 0x1C8 0x428 0x000 0x5 0x0 -#define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_BIT_STREAM02 0x1C8 0x428 0x4C8 0x6 0x6 +#define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_BIT_STREAM02 0x1C8 0x428 0x4C8 0x6 0x7 #define MX8MP_IOMUXC_SAI3_TXC__TPSMP_HDATA02 0x1C8 0x428 0x000 0x7 0x0 #define MX8MP_IOMUXC_SAI3_TXD__AUDIOMIX_SAI3_TX_DATA00 0x1CC 0x42C 0x000 0x0 0x0 #define MX8MP_IOMUXC_SAI3_TXD__AUDIOMIX_SAI2_TX_DATA03 0x1CC 0x42C 0x000 0x1 0x0 diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index 9b1616e59d58..9f6ba763238d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -145,7 +145,7 @@ aips1: bus@30000000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x301f0000 0x10000>; + reg = <0x30000000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges; @@ -318,7 +318,7 @@ aips2: bus@30400000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x305f0000 0x400000>; + reg = <0x30400000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges; @@ -378,7 +378,7 @@ aips3: bus@30800000 { compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x309f0000 0x400000>; + reg = <0x30800000 0x400000>; #address-cells = <1>; #size-cells 
= <1>; ranges; diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi index 75b384217a23..bab88369be1b 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi @@ -291,7 +291,7 @@ bus@30000000 { /* AIPS1 */ compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x301f0000 0x10000>; + reg = <0x30000000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x30000000 0x30000000 0x400000>; @@ -696,7 +696,7 @@ bus@30400000 { /* AIPS2 */ compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x305f0000 0x10000>; + reg = <0x30400000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x30400000 0x30400000 0x400000>; @@ -756,7 +756,7 @@ bus@30800000 { /* AIPS3 */ compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x309f0000 0x10000>; + reg = <0x30800000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x30800000 0x30800000 0x400000>, @@ -1029,7 +1029,7 @@ bus@32c00000 { /* AIPS4 */ compatible = "fsl,aips-bus", "simple-bus"; - reg = <0x32df0000 0x10000>; + reg = <0x32c00000 0x400000>; #address-cells = <1>; #size-cells = <1>; ranges = <0x32c00000 0x32c00000 0x400000>; diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi index ccb8e88a60c5..d819e44d94a8 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi @@ -1402,8 +1402,8 @@ "venc_lt_sel"; assigned-clocks = <&topckgen CLK_TOP_VENC_SEL>, <&topckgen CLK_TOP_VENC_LT_SEL>; - assigned-clock-parents = <&topckgen CLK_TOP_VENCPLL_D2>, - <&topckgen CLK_TOP_UNIVPLL1_D2>; + assigned-clock-parents = <&topckgen CLK_TOP_VCODECPLL>, + <&topckgen CLK_TOP_VCODECPLL_370P5>; }; jpegdec: jpegdec@18004000 { diff --git a/arch/arm64/boot/dts/mediatek/mt8516.dtsi b/arch/arm64/boot/dts/mediatek/mt8516.dtsi index 2f8adf042195..89af661e7f63 100644 --- a/arch/arm64/boot/dts/mediatek/mt8516.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8516.dtsi @@ -191,6 +191,11 @@ #clock-cells = <1>; }; + pericfg: pericfg@10003050 { + compatible = "mediatek,mt8516-pericfg", "syscon"; + reg = <0 0x10003050 0 0x1000>; + }; + apmixedsys: apmixedsys@10018000 { compatible = "mediatek,mt8516-apmixedsys", "syscon"; reg = <0 0x10018000 0 0x710>; @@ -401,6 +406,18 @@ status = "disabled"; }; + ethernet: ethernet@11180000 { + compatible = "mediatek,mt8516-eth"; + reg = <0 0x11180000 0 0x1000>; + mediatek,pericfg = <&pericfg>; + interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>; + clocks = <&topckgen CLK_TOP_RG_ETH>, + <&topckgen CLK_TOP_66M_ETH>, + <&topckgen CLK_TOP_133M_ETH>; + clock-names = "core", "reg", "trans"; + status = "disabled"; + }; + rng: rng@1020c000 { compatible = "mediatek,mt8516-rng", "mediatek,mt7623-rng"; diff --git a/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi b/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi index a31093d7142b..dfceffe6950a 100644 --- a/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi +++ b/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi @@ -9,6 +9,7 @@ / { aliases { serial0 = &uart0; + ethernet0 = ðernet; }; chosen { @@ -166,6 +167,24 @@ status = "okay"; }; +ðernet { + pinctrl-names = "default"; + pinctrl-0 = <ðernet_pins_default>; + phy-handle = <ð_phy>; + phy-mode = "rmii"; + mac-address = [00 00 00 00 00 00]; + status = "okay"; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + eth_phy: ethernet-phy@0 { + reg = <0>; + }; + }; +}; + &usb0 { status = "okay"; dr_mode = "peripheral"; @@ -218,4 +237,19 @@ bias-pull-up; }; }; + + ethernet_pins_default: 
ethernet { + pins_ethernet { + pinmux = <MT8516_PIN_0_EINT0__FUNC_EXT_TXD0>, + <MT8516_PIN_1_EINT1__FUNC_EXT_TXD1>, + <MT8516_PIN_5_EINT5__FUNC_EXT_RXER>, + <MT8516_PIN_6_EINT6__FUNC_EXT_RXC>, + <MT8516_PIN_7_EINT7__FUNC_EXT_RXDV>, + <MT8516_PIN_8_EINT8__FUNC_EXT_RXD0>, + <MT8516_PIN_9_EINT9__FUNC_EXT_RXD1>, + <MT8516_PIN_12_EINT12__FUNC_EXT_TXEN>, + <MT8516_PIN_38_MRG_DI__FUNC_EXT_MDIO>, + <MT8516_PIN_39_MRG_DO__FUNC_EXT_MDC>; + }; + }; }; diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi index af87350b5547..c4abbccf2bed 100644 --- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi +++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi @@ -658,8 +658,8 @@ s11 { qcom,saw-leader; regulator-always-on; - regulator-min-microvolt = <1230000>; - regulator-max-microvolt = <1230000>; + regulator-min-microvolt = <980000>; + regulator-max-microvolt = <980000>; }; }; @@ -908,10 +908,27 @@ status = "okay"; }; +&q6asmdai { + dai@0 { + reg = <0>; + }; + + dai@1 { + reg = <1>; + }; + + dai@2 { + reg = <2>; + }; +}; + &sound { compatible = "qcom,apq8096-sndcard"; model = "DB820c"; - audio-routing = "RX_BIAS", "MCLK"; + audio-routing = "RX_BIAS", "MCLK", + "MM_DL1", "MultiMedia1 Playback", + "MM_DL2", "MultiMedia2 Playback", + "MultiMedia3 Capture", "MM_UL3"; mm1-dai-link { link-name = "MultiMedia1"; diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi index 14827adebd94..98634d5c4440 100644 --- a/arch/arm64/boot/dts/qcom/msm8996.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi @@ -2066,6 +2066,8 @@ reg = <APR_SVC_ASM>; q6asmdai: dais { compatible = "qcom,q6asm-dais"; + #address-cells = <1>; + #size-cells = <0>; #sound-dai-cells = <1>; iommus = <&lpass_q6_smmu 1>; }; diff --git a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts index a2e05926b429..21fd6f8d5799 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts @@ -442,17 +442,14 @@ &q6asmdai { dai@0 { reg = <0>; - direction = <2>; }; dai@1 { reg = <1>; - direction = <2>; }; dai@2 { reg = <2>; - direction = <1>; }; dai@3 { diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi index 8f926b5234d4..de6bb86c4968 100644 --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi @@ -1761,6 +1761,8 @@ ipa: ipa@1e40000 { compatible = "qcom,sdm845-ipa"; + + iommus = <&apps_smmu 0x720 0x3>; reg = <0 0x1e40000 0 0x7000>, <0 0x1e47000 0 0x2000>, <0 0x1e04000 0 0x2c000>; diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 3b617a75fafa..51a670ad15b2 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -359,12 +359,10 @@ &q6asmdai { dai@0 { reg = <0>; - direction = <2>; }; dai@1 { reg = <1>; - direction = <1>; }; }; diff --git a/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts b/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts index 2afb91ec9c8d..ac2156ab3e62 100644 --- a/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts +++ b/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts @@ -137,8 +137,6 @@ adi,input-depth = <8>; adi,input-colorspace = "rgb"; adi,input-clock = "1x"; - adi,input-style = <1>; - adi,input-justification = "evenly"; ports { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts b/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts index 
d7c7b9156e08..01c4ba0f7be1 100644 --- a/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts +++ b/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts @@ -150,8 +150,6 @@ adi,input-depth = <8>; adi,input-colorspace = "rgb"; adi,input-clock = "1x"; - adi,input-style = <1>; - adi,input-justification = "evenly"; ports { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/renesas/r8a77980-condor.dts b/arch/arm64/boot/dts/renesas/r8a77980-condor.dts index 3dde028e22a6..ef8350a062af 100644 --- a/arch/arm64/boot/dts/renesas/r8a77980-condor.dts +++ b/arch/arm64/boot/dts/renesas/r8a77980-condor.dts @@ -174,8 +174,6 @@ adi,input-depth = <8>; adi,input-colorspace = "rgb"; adi,input-clock = "1x"; - adi,input-style = <1>; - adi,input-justification = "evenly"; ports { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts b/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts index adbfd8f07d06..6dff04693223 100644 --- a/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts +++ b/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts @@ -141,8 +141,6 @@ adi,input-depth = <8>; adi,input-colorspace = "rgb"; adi,input-clock = "1x"; - adi,input-style = <1>; - adi,input-justification = "evenly"; ports { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/renesas/r8a77980.dtsi b/arch/arm64/boot/dts/renesas/r8a77980.dtsi index e01b0508a18f..d672b320bc14 100644 --- a/arch/arm64/boot/dts/renesas/r8a77980.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a77980.dtsi @@ -1318,6 +1318,7 @@ ipmmu_vip0: mmu@e7b00000 { compatible = "renesas,ipmmu-r8a77980"; reg = <0 0xe7b00000 0 0x1000>; + renesas,ipmmu-main = <&ipmmu_mm 4>; power-domains = <&sysc R8A77980_PD_ALWAYS_ON>; #iommu-cells = <1>; }; @@ -1325,6 +1326,7 @@ ipmmu_vip1: mmu@e7960000 { compatible = "renesas,ipmmu-r8a77980"; reg = <0 0xe7960000 0 0x1000>; + renesas,ipmmu-main = <&ipmmu_mm 11>; power-domains = <&sysc R8A77980_PD_ALWAYS_ON>; #iommu-cells = <1>; }; diff --git a/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts b/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts index 4fd2b14fbb8b..dc24cec46ae1 100644 --- a/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts +++ b/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts @@ -360,8 +360,6 @@ adi,input-depth = <8>; adi,input-colorspace = "rgb"; adi,input-clock = "1x"; - adi,input-style = <1>; - adi,input-justification = "evenly"; ports { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/renesas/r8a77995-draak.dts b/arch/arm64/boot/dts/renesas/r8a77995-draak.dts index 67634cb01d6b..79c73a99d2fe 100644 --- a/arch/arm64/boot/dts/renesas/r8a77995-draak.dts +++ b/arch/arm64/boot/dts/renesas/r8a77995-draak.dts @@ -272,8 +272,8 @@ hdmi-encoder@39 { compatible = "adi,adv7511w"; - reg = <0x39>, <0x3f>, <0x38>, <0x3c>; - reg-names = "main", "edid", "packet", "cec"; + reg = <0x39>, <0x3f>, <0x3c>, <0x38>; + reg-names = "main", "edid", "cec", "packet"; interrupt-parent = <&gpio1>; interrupts = <28 IRQ_TYPE_LEVEL_LOW>; @@ -284,8 +284,6 @@ adi,input-depth = <8>; adi,input-colorspace = "rgb"; adi,input-clock = "1x"; - adi,input-style = <1>; - adi,input-justification = "evenly"; ports { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/rockchip/px30.dtsi b/arch/arm64/boot/dts/rockchip/px30.dtsi index f809dd6d5dc3..adc9b8bf5eaa 100644 --- a/arch/arm64/boot/dts/rockchip/px30.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30.dtsi @@ -143,7 +143,7 @@ }; arm-pmu { - compatible = "arm,cortex-a53-pmu"; + compatible = "arm,cortex-a35-pmu"; interrupts = <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 102 
IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/rockchip/rk3308.dtsi b/arch/arm64/boot/dts/rockchip/rk3308.dtsi index ac43bc3f7031..ac7f694079d0 100644 --- a/arch/arm64/boot/dts/rockchip/rk3308.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3308.dtsi @@ -127,7 +127,7 @@ }; arm-pmu { - compatible = "arm,cortex-a53-pmu"; + compatible = "arm,cortex-a35-pmu"; interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 85 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts index 49c4b96da3d4..ac29c2744d08 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts @@ -82,17 +82,16 @@ &gmac2phy { phy-supply = <&vcc_phy>; clock_in_out = "output"; - assigned-clocks = <&cru SCLK_MAC2PHY_SRC>; assigned-clock-rate = <50000000>; assigned-clocks = <&cru SCLK_MAC2PHY>; assigned-clock-parents = <&cru SCLK_MAC2PHY_SRC>; - + status = "okay"; }; &i2c1 { status = "okay"; - rk805: rk805@18 { + rk805: pmic@18 { compatible = "rockchip,rk805"; reg = <0x18>; interrupt-parent = <&gpio2>; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts index bf3e546f5266..ebf3eb222e1f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts @@ -170,7 +170,7 @@ &i2c1 { status = "okay"; - rk805: rk805@18 { + rk805: pmic@18 { compatible = "rockchip,rk805"; reg = <0x18>; interrupt-parent = <&gpio2>; diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 7e88d88aab98..a4d591d91533 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -299,8 +299,6 @@ grf: syscon@ff100000 { compatible = "rockchip,rk3328-grf", "syscon", "simple-mfd"; reg = <0x0 0xff100000 0x0 0x1000>; - #address-cells = <1>; - #size-cells = <1>; io_domains: io-domains { compatible = "rockchip,rk3328-io-voltage-domain"; @@ -1794,10 +1792,6 @@ }; gmac2phy { - fephyled_speed100: fephyled-speed100 { - rockchip,pins = <0 RK_PD7 1 &pcfg_pull_none>; - }; - fephyled_speed10: fephyled-speed10 { rockchip,pins = <0 RK_PD6 1 &pcfg_pull_none>; }; @@ -1806,18 +1800,6 @@ rockchip,pins = <0 RK_PD6 2 &pcfg_pull_none>; }; - fephyled_rxm0: fephyled-rxm0 { - rockchip,pins = <0 RK_PD5 1 &pcfg_pull_none>; - }; - - fephyled_txm0: fephyled-txm0 { - rockchip,pins = <0 RK_PD5 2 &pcfg_pull_none>; - }; - - fephyled_linkm0: fephyled-linkm0 { - rockchip,pins = <0 RK_PD4 1 &pcfg_pull_none>; - }; - fephyled_rxm1: fephyled-rxm1 { rockchip,pins = <2 RK_PD1 2 &pcfg_pull_none>; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts index 5ea281b55fe2..c49982dfd8fc 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts @@ -147,7 +147,7 @@ "Speaker", "Speaker Amplifier OUTL", "Speaker", "Speaker Amplifier OUTR"; - simple-audio-card,hp-det-gpio = <&gpio0 RK_PB0 GPIO_ACTIVE_LOW>; + simple-audio-card,hp-det-gpio = <&gpio0 RK_PB0 GPIO_ACTIVE_HIGH>; simple-audio-card,aux-devs = <&speaker_amp>; simple-audio-card,pin-switches = "Speaker"; @@ -690,7 +690,8 @@ fusb0: fusb30x@22 { compatible = "fcs,fusb302"; reg = <0x22>; - fcs,int_n = <&gpio1 RK_PA2 GPIO_ACTIVE_HIGH>; + interrupt-parent = <&gpio1>; + interrupts = <RK_PA2 IRQ_TYPE_LEVEL_LOW>; pinctrl-names = "default"; pinctrl-0 = <&fusb0_int_gpio>; vbus-supply = 
<&vbus_typec>; @@ -788,13 +789,13 @@ dc-charger { dc_det_gpio: dc-det-gpio { - rockchip,pins = <4 RK_PD0 RK_FUNC_GPIO &pcfg_pull_none>; + rockchip,pins = <4 RK_PD0 RK_FUNC_GPIO &pcfg_pull_up>; }; }; es8316 { hp_det_gpio: hp-det-gpio { - rockchip,pins = <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_down>; + rockchip,pins = <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_up>; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 74f2c3d49095..1448f358ed0a 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -403,7 +403,7 @@ reset-names = "usb3-otg"; status = "disabled"; - usbdrd_dwc3_0: dwc3 { + usbdrd_dwc3_0: usb@fe800000 { compatible = "snps,dwc3"; reg = <0x0 0xfe800000 0x0 0x100000>; interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH 0>; @@ -439,7 +439,7 @@ reset-names = "usb3-otg"; status = "disabled"; - usbdrd_dwc3_1: dwc3 { + usbdrd_dwc3_1: usb@fe900000 { compatible = "snps,dwc3"; reg = <0x0 0xfe900000 0x0 0x100000>; interrupts = <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH 0>; @@ -1124,8 +1124,6 @@ pmugrf: syscon@ff320000 { compatible = "rockchip,rk3399-pmugrf", "syscon", "simple-mfd"; reg = <0x0 0xff320000 0x0 0x1000>; - #address-cells = <1>; - #size-cells = <1>; pmu_io_domains: io-domains { compatible = "rockchip,rk3399-pmu-io-voltage-domain"; @@ -1883,10 +1881,10 @@ gpu: gpu@ff9a0000 { compatible = "rockchip,rk3399-mali", "arm,mali-t860"; reg = <0x0 0xff9a0000 0x0 0x10000>; - interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH 0>, - <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH 0>, - <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH 0>; - interrupt-names = "gpu", "job", "mmu"; + interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH 0>, + <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH 0>, + <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH 0>; + interrupt-names = "job", "mmu", "gpu"; clocks = <&cru ACLK_GPU>; #cooling-cells = <2>; power-domains = <&power RK3399_PD_GPU>; diff --git a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi index 11887c72f23a..0d533d52fcda 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi @@ -570,6 +570,28 @@ <0x5>; /* RX_CHAN */ ti,sci-rm-range-rflow = <0x6>; /* GP RFLOW */ }; + + cpts@310d0000 { + compatible = "ti,am65-cpts"; + reg = <0x0 0x310d0000 0x0 0x400>; + reg-names = "cpts"; + clocks = <&main_cpts_mux>; + clock-names = "cpts"; + interrupts-extended = <&intr_main_navss 163 0>; + interrupt-names = "cpts"; + ti,cpts-periodic-outputs = <6>; + ti,cpts-ext-ts-inputs = <8>; + + main_cpts_mux: refclk-mux { + #clock-cells = <0>; + clocks = <&k3_clks 118 5>, <&k3_clks 118 11>, + <&k3_clks 118 6>, <&k3_clks 118 3>, + <&k3_clks 118 8>, <&k3_clks 118 14>, + <&k3_clks 120 3>, <&k3_clks 121 3>; + assigned-clocks = <&main_cpts_mux>; + assigned-clock-parents = <&k3_clks 118 5>; + }; + }; }; main_gpio0: main_gpio0@600000 { diff --git a/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi b/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi index 353d1e2532a7..ae5f813d0cac 100644 --- a/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi +++ b/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi @@ -247,5 +247,26 @@ clock-names = "fck"; bus_freq = <1000000>; }; + + cpts@3d000 { + compatible = "ti,am65-cpts"; + reg = <0x0 0x3d000 0x0 0x400>; + clocks = <&mcu_cpsw_cpts_mux>; + clock-names = "cpts"; + interrupts-extended = <&gic500 GIC_SPI 570 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "cpts"; + ti,cpts-ext-ts-inputs = <4>; + ti,cpts-periodic-outputs = <2>; + + mcu_cpsw_cpts_mux: refclk-mux { + #clock-cells = <0>; + clocks = <&k3_clks 118 5>, <&k3_clks 
118 11>, + <&k3_clks 118 6>, <&k3_clks 118 3>, + <&k3_clks 118 8>, <&k3_clks 118 14>, + <&k3_clks 120 3>, <&k3_clks 121 3>; + assigned-clocks = <&mcu_cpsw_cpts_mux>; + assigned-clock-parents = <&k3_clks 118 5>; + }; + }; }; }; diff --git a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi index 0b9d14b838a1..844a5b50cf09 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi @@ -254,6 +254,18 @@ <0x0c>; /* RX_UHCHAN */ ti,sci-rm-range-rflow = <0x00>; /* GP RFLOW */ }; + + cpts@310d0000 { + compatible = "ti,j721e-cpts"; + reg = <0x0 0x310d0000 0x0 0x400>; + reg-names = "cpts"; + clocks = <&k3_clks 201 1>; + clock-names = "cpts"; + interrupts-extended = <&main_navss_intr 201 0>; + interrupt-names = "cpts"; + ti,cpts-periodic-outputs = <6>; + ti,cpts-ext-ts-inputs = <8>; + }; }; main_pmx0: pinmux@11c000 { diff --git a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi index 3d6064125b40..dc31bd0434cb 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi @@ -338,5 +338,16 @@ clock-names = "fck"; bus_freq = <1000000>; }; + + cpts@3d000 { + compatible = "ti,am65-cpts"; + reg = <0x0 0x3d000 0x0 0x400>; + clocks = <&k3_clks 18 2>; + clock-names = "cpts"; + interrupts-extended = <&gic500 GIC_SPI 858 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "cpts"; + ti,cpts-ext-ts-inputs = <4>; + ti,cpts-periodic-outputs = <2>; + }; }; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 24e534d85045..71d9ca727eee 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -208,7 +208,7 @@ CONFIG_PCIE_QCOM=y CONFIG_PCIE_ARMADA_8K=y CONFIG_PCIE_KIRIN=y CONFIG_PCIE_HISI_STB=y -CONFIG_PCIE_TEGRA194=m +CONFIG_PCIE_TEGRA194_HOST=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_FW_LOADER_USER_HELPER=y @@ -513,6 +513,7 @@ CONFIG_UNIPHIER_THERMAL=y CONFIG_WATCHDOG=y CONFIG_ARM_SP805_WATCHDOG=y CONFIG_ARM_SBSA_WATCHDOG=y +CONFIG_ARM_SMC_WATCHDOG=y CONFIG_S3C2410_WATCHDOG=y CONFIG_DW_WATCHDOG=y CONFIG_SUNXI_WATCHDOG=m @@ -567,6 +568,7 @@ CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y CONFIG_MEDIA_SDR_SUPPORT=y CONFIG_MEDIA_CONTROLLER=y CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_MEDIA_PLATFORM_SUPPORT=y # CONFIG_DVB_NET is not set CONFIG_MEDIA_USB_SUPPORT=y CONFIG_USB_VIDEO_CLASS=m @@ -610,8 +612,9 @@ CONFIG_DRM_MSM=m CONFIG_DRM_TEGRA=m CONFIG_DRM_PANEL_LVDS=m CONFIG_DRM_PANEL_SIMPLE=m -CONFIG_DRM_DUMB_VGA_DAC=m +CONFIG_DRM_SIMPLE_BRIDGE=m CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +CONFIG_DRM_DISPLAY_CONNECTOR=m CONFIG_DRM_SII902X=m CONFIG_DRM_THINE_THC63LVD1024=m CONFIG_DRM_TI_SN65DSI86=m @@ -848,7 +851,8 @@ CONFIG_QCOM_APR=m CONFIG_ARCH_R8A774A1=y CONFIG_ARCH_R8A774B1=y CONFIG_ARCH_R8A774C0=y -CONFIG_ARCH_R8A7795=y +CONFIG_ARCH_R8A77950=y +CONFIG_ARCH_R8A77951=y CONFIG_ARCH_R8A77960=y CONFIG_ARCH_R8A77961=y CONFIG_ARCH_R8A77965=y diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index ed5409c6abf4..395bbf64b2ab 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -158,7 +158,6 @@ static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm, unsigned int key_len) { struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); - SHASH_DESC_ON_STACK(desc, ctx->hash); u8 digest[SHA256_DIGEST_SIZE]; int ret; @@ -166,8 +165,7 @@ static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm, if (ret) return ret; - desc->tfm = ctx->hash; - 
crypto_shash_digest(desc, in_key, key_len, digest); + crypto_shash_tfm_digest(ctx->hash, in_key, key_len, digest); return aes_expandkey(&ctx->key2, digest, sizeof(digest)); } diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 37ca3e889848..af2bbca38e70 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, !crypto_simd_usable()) return chacha_crypt_generic(state, dst, src, bytes, nrounds); - kernel_neon_begin(); - chacha_doneon(state, dst, src, bytes, nrounds); - kernel_neon_end(); + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); } EXPORT_SYMBOL(chacha_crypt_arch); diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index 5a95c2628fbf..111d9c9abddd 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -66,7 +66,7 @@ #include <asm/assembler.h> .text - .cpu generic+crypto + .arch armv8-a+crypto init_crc .req w19 buf .req x20 diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c index 895d3727c1fb..c5405e6a6db7 100644 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -30,7 +30,7 @@ static int nhpoly1305_neon_update(struct shash_desc *desc, return crypto_nhpoly1305_update(desc, src, srclen); do { - unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_neon_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index e97b092f56b8..f33ada70c4ed 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - kernel_neon_begin(); - poly1305_blocks_neon(&dctx->h, src, len, 1); - kernel_neon_end(); + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(&dctx->h, src, todo, 1); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); } else { poly1305_blocks(&dctx->h, src, len, 1); + src += len; } - src += len; nbytes %= POLY1305_BLOCK_SIZE; } diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index ddf4a0d85c1c..77bc6e72abae 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -12,7 +12,6 @@ #include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> -#include <linux/cryptohash.h> #include <linux/types.h> #include <linux/string.h> diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 78d3083de6b7..370ccb29602f 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -6,7 +6,6 @@ */ #include <crypto/internal/hash.h> -#include <linux/cryptohash.h> #include <linux/types.h> #include <linux/string.h> #include <crypto/sha.h> diff --git a/arch/arm64/include/asm/asm_pointer_auth.h b/arch/arm64/include/asm/asm_pointer_auth.h index ce2a8486992b..52dead2a8640 100644 --- 
a/arch/arm64/include/asm/asm_pointer_auth.h +++ b/arch/arm64/include/asm/asm_pointer_auth.h @@ -39,25 +39,58 @@ alternative_if ARM64_HAS_GENERIC_AUTH alternative_else_nop_endif .endm - .macro ptrauth_keys_install_kernel tsk, sync, tmp1, tmp2, tmp3 -alternative_if ARM64_HAS_ADDRESS_AUTH + .macro __ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 mov \tmp1, #THREAD_KEYS_KERNEL add \tmp1, \tsk, \tmp1 ldp \tmp2, \tmp3, [\tmp1, #PTRAUTH_KERNEL_KEY_APIA] msr_s SYS_APIAKEYLO_EL1, \tmp2 msr_s SYS_APIAKEYHI_EL1, \tmp3 - .if \sync == 1 + .endm + + .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 +alternative_if ARM64_HAS_ADDRESS_AUTH + __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3 +alternative_else_nop_endif + .endm + + .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3 +alternative_if ARM64_HAS_ADDRESS_AUTH + __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3 isb - .endif alternative_else_nop_endif .endm + .macro __ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3 + mrs \tmp1, id_aa64isar1_el1 + ubfx \tmp1, \tmp1, #ID_AA64ISAR1_APA_SHIFT, #8 + cbz \tmp1, .Lno_addr_auth\@ + mov_q \tmp1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ + SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) + mrs \tmp2, sctlr_el1 + orr \tmp2, \tmp2, \tmp1 + msr sctlr_el1, \tmp2 + __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3 + isb +.Lno_addr_auth\@: + .endm + + .macro ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3 +alternative_if_not ARM64_HAS_ADDRESS_AUTH + b .Lno_addr_auth\@ +alternative_else_nop_endif + __ptrauth_keys_init_cpu \tsk, \tmp1, \tmp2, \tmp3 +.Lno_addr_auth\@: + .endm + #else /* CONFIG_ARM64_PTR_AUTH */ .macro ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3 .endm - .macro ptrauth_keys_install_kernel tsk, sync, tmp1, tmp2, tmp3 + .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3 + .endm + + .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3 .endm #endif /* CONFIG_ARM64_PTR_AUTH */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 0bff325117b4..54d181177656 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -736,4 +736,54 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .Lyield_out_\@ : .endm +/* + * This macro emits a program property note section identifying + * architecture features which require special handling, mainly for + * use in assembly files included in the VDSO. + */ + +#define NT_GNU_PROPERTY_TYPE_0 5 +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) +#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) + +#ifdef CONFIG_ARM64_BTI_KERNEL +#define GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT \ + ((GNU_PROPERTY_AARCH64_FEATURE_1_BTI | \ + GNU_PROPERTY_AARCH64_FEATURE_1_PAC)) +#endif + +#ifdef GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT +.macro emit_aarch64_feature_1_and, feat=GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT + .pushsection .note.gnu.property, "a" + .align 3 + .long 2f - 1f + .long 6f - 3f + .long NT_GNU_PROPERTY_TYPE_0 +1: .string "GNU" +2: + .align 3 +3: .long GNU_PROPERTY_AARCH64_FEATURE_1_AND + .long 5f - 4f +4: + /* + * This is described with an array of char in the Linux API + * spec but the text and all other usage (including binutils, + * clang and GCC) treat this as a 32 bit value so no swizzling + * is required for big endian. 
+ */ + .long \feat +5: + .align 3 +6: + .popsection +.endm + +#else +.macro emit_aarch64_feature_1_and, feat=0 +.endm + +#endif /* GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT */ + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index e6cca3d4acf7..ce50c1f1f1ea 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -79,7 +79,7 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) * IPI all online CPUs so that they undergo a context synchronization * event and are forced to refetch the new instructions. */ -#ifdef CONFIG_KGDB + /* * KGDB performs cache maintenance with interrupts disabled, so we * will deadlock trying to IPI the secondary CPUs. In theory, we can @@ -89,9 +89,9 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) * the patching operation, so we don't need extra IPIs here anyway. * In which case, add a KGDB-specific bodge and return early. */ - if (kgdb_connected && irqs_disabled()) + if (in_dbg_master()) return; -#endif + kick_all_cpus_sync(); } diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h index eece20d2c55f..51a7ce87cdfe 100644 --- a/arch/arm64/include/asm/compiler.h +++ b/arch/arm64/include/asm/compiler.h @@ -2,8 +2,6 @@ #ifndef __ASM_COMPILER_H #define __ASM_COMPILER_H -#if defined(CONFIG_ARM64_PTR_AUTH) - /* * The EL0/EL1 pointer bits used by a pointer authentication code. * This is dependent on TBI0/TBI1 being enabled, or bits 63:56 would also apply. @@ -19,6 +17,4 @@ #define __builtin_return_address(val) \ (void *)(ptrauth_clear_pac((unsigned long)__builtin_return_address(val))) -#endif /* CONFIG_ARM64_PTR_AUTH */ - #endif /* __ASM_COMPILER_H */ diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b4a40535a3d8..7faae6ff3ab4 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -33,6 +33,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64zfr0; u32 reg_id_dfr0; + u32 reg_id_dfr1; u32 reg_id_isar0; u32 reg_id_isar1; u32 reg_id_isar2; @@ -44,8 +45,11 @@ struct cpuinfo_arm64 { u32 reg_id_mmfr1; u32 reg_id_mmfr2; u32 reg_id_mmfr3; + u32 reg_id_mmfr4; + u32 reg_id_mmfr5; u32 reg_id_pfr0; u32 reg_id_pfr1; + u32 reg_id_pfr2; u32 reg_mvfr0; u32 reg_mvfr1; diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 8eb5a088ae65..d7b3bb0cb180 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -44,7 +44,7 @@ #define ARM64_SSBS 34 #define ARM64_WORKAROUND_1418040 35 #define ARM64_HAS_SB 36 -#define ARM64_WORKAROUND_SPECULATIVE_AT_VHE 37 +#define ARM64_WORKAROUND_SPECULATIVE_AT 37 #define ARM64_HAS_ADDRESS_AUTH_ARCH 38 #define ARM64_HAS_ADDRESS_AUTH_IMP_DEF 39 #define ARM64_HAS_GENERIC_AUTH_ARCH 40 @@ -55,13 +55,14 @@ #define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM 45 #define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM 46 #define ARM64_WORKAROUND_1542419 47 -#define ARM64_WORKAROUND_SPECULATIVE_AT_NVHE 48 -#define ARM64_HAS_E0PD 49 -#define ARM64_HAS_RNG 50 -#define ARM64_HAS_AMU_EXTN 51 -#define ARM64_HAS_ADDRESS_AUTH 52 -#define ARM64_HAS_GENERIC_AUTH 53 +#define ARM64_HAS_E0PD 48 +#define ARM64_HAS_RNG 49 +#define ARM64_HAS_AMU_EXTN 50 +#define ARM64_HAS_ADDRESS_AUTH 51 +#define ARM64_HAS_GENERIC_AUTH 52 +#define ARM64_HAS_32BIT_EL1 53 +#define ARM64_BTI 54 -#define ARM64_NCAPS 54 +#define ARM64_NCAPS 55 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h index afe08251ff95..5d1f4ae42799 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -551,6 +551,13 @@ static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0) cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_BIGENDEL0_SHIFT) == 0x1; } +static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); + + return val == ID_AA64PFR0_EL1_32BIT_64BIT; +} + static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); @@ -680,6 +687,11 @@ static inline bool system_has_prio_mask_debugging(void) system_uses_irq_prio_masking(); } +static inline bool system_supports_bti(void) +{ + return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI); +} + #define ARM64_BP_HARDEN_UNKNOWN -1 #define ARM64_BP_HARDEN_WA_NEEDED 0 #define ARM64_BP_HARDEN_NOT_REQUIRED 1 @@ -745,6 +757,24 @@ static inline bool cpu_has_hw_af(void) extern bool cpu_has_amu_feat(int cpu); #endif +static inline unsigned int get_vmid_bits(u64 mmfr1) +{ + int vmid_bits; + + vmid_bits = cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_VMIDBITS_SHIFT); + if (vmid_bits == ID_AA64MMFR1_VMIDBITS_16) + return 16; + + /* + * Return the default here even if any reserved + * value is fetched from the system register. + */ + return 8; +} + +u32 get_kvm_ipa_limit(void); + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 7619f473155f..e5ceea213e39 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -125,5 +125,7 @@ static inline int reinstall_suspended_bps(struct pt_regs *regs) int aarch32_break_handler(struct pt_regs *regs); +void debug_traps_init(void); + #endif /* __ASSEMBLY */ #endif /* __ASM_DEBUG_MONITORS_H */ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 45e821222774..d4ab3f73e7a3 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -86,14 +86,6 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base, return (image_addr & ~(SZ_1G - 1UL)) + (1UL << (VA_BITS_MIN - 1)); } -#define efi_bs_call(func, ...) efi_system_table()->boottime->func(__VA_ARGS__) -#define efi_rt_call(func, ...) efi_system_table()->runtime->func(__VA_ARGS__) -#define efi_is_native() (true) - -#define efi_table_attr(inst, attr) (inst->attr) - -#define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__) - #define alloc_screen_info(x...) 
&screen_info static inline void free_screen_info(struct screen_info *si) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index b618017205a3..4f00d50585a4 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -114,7 +114,11 @@ #ifndef __ASSEMBLY__ +#include <uapi/linux/elf.h> #include <linux/bug.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/types.h> #include <asm/processor.h> /* for signal_minsigstksz, used by ARCH_DLINFO */ typedef unsigned long elf_greg_t; @@ -224,6 +228,52 @@ extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, #endif /* CONFIG_COMPAT */ +struct arch_elf_state { + int flags; +}; + +#define ARM64_ELF_BTI (1 << 0) + +#define INIT_ARCH_ELF_STATE { \ + .flags = 0, \ +} + +static inline int arch_parse_elf_property(u32 type, const void *data, + size_t datasz, bool compat, + struct arch_elf_state *arch) +{ + /* No known properties for AArch32 yet */ + if (IS_ENABLED(CONFIG_COMPAT) && compat) + return 0; + + if (type == GNU_PROPERTY_AARCH64_FEATURE_1_AND) { + const u32 *p = data; + + if (datasz != sizeof(*p)) + return -ENOEXEC; + + if (system_supports_bti() && + (*p & GNU_PROPERTY_AARCH64_FEATURE_1_BTI)) + arch->flags |= ARM64_ELF_BTI; + } + + return 0; +} + +static inline int arch_elf_pt_proc(void *ehdr, void *phdr, + struct file *f, bool is_interp, + struct arch_elf_state *state) +{ + return 0; +} + +static inline int arch_check_elf(void *ehdr, bool has_interp, + void *interp_ehdr, + struct arch_elf_state *state) +{ + return 0; +} + #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 6a395a7e6707..035003acfa87 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -22,7 +22,7 @@ #define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ /* Unallocated EC: 0x0A - 0x0B */ #define ESR_ELx_EC_CP14_64 (0x0C) -/* Unallocated EC: 0x0d */ +#define ESR_ELx_EC_BTI (0x0D) #define ESR_ELx_EC_ILL (0x0E) /* Unallocated EC: 0x0F - 0x10 */ #define ESR_ELx_EC_SVC32 (0x11) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 7a6e81ca23a8..7577a754d443 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -34,6 +34,7 @@ static inline u32 disr_to_esr(u64 disr) asmlinkage void enter_from_user_mode(void); void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); +void do_bti(struct pt_regs *regs); asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, struct pt_regs *regs); diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index 87ad961f3c97..985493af704b 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -32,30 +32,70 @@ u64 smp_irq_stat_cpu(unsigned int cpu); struct nmi_ctx { u64 hcr; + unsigned int cnt; }; DECLARE_PER_CPU(struct nmi_ctx, nmi_contexts); -#define arch_nmi_enter() \ - do { \ - if (is_kernel_in_hyp_mode()) { \ - struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts); \ - nmi_ctx->hcr = read_sysreg(hcr_el2); \ - if (!(nmi_ctx->hcr & HCR_TGE)) { \ - write_sysreg(nmi_ctx->hcr | HCR_TGE, hcr_el2); \ - isb(); \ - } \ - } \ - } while (0) +#define arch_nmi_enter() \ +do { \ + struct nmi_ctx *___ctx; \ + u64 ___hcr; \ + \ + if (!is_kernel_in_hyp_mode()) \ + break; \ + \ + ___ctx = this_cpu_ptr(&nmi_contexts); \ + if 
(___ctx->cnt) { \ + ___ctx->cnt++; \ + break; \ + } \ + \ + ___hcr = read_sysreg(hcr_el2); \ + if (!(___hcr & HCR_TGE)) { \ + write_sysreg(___hcr | HCR_TGE, hcr_el2); \ + isb(); \ + } \ + /* \ + * Make sure the sysreg write is performed before ___ctx->cnt \ + * is set to 1. NMIs that see cnt == 1 will rely on us. \ + */ \ + barrier(); \ + ___ctx->cnt = 1; \ + /* \ + * Make sure ___ctx->cnt is set before we save ___hcr. We \ + * don't want ___ctx->hcr to be overwritten. \ + */ \ + barrier(); \ + ___ctx->hcr = ___hcr; \ +} while (0) -#define arch_nmi_exit() \ - do { \ - if (is_kernel_in_hyp_mode()) { \ - struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts); \ - if (!(nmi_ctx->hcr & HCR_TGE)) \ - write_sysreg(nmi_ctx->hcr, hcr_el2); \ - } \ - } while (0) +#define arch_nmi_exit() \ +do { \ + struct nmi_ctx *___ctx; \ + u64 ___hcr; \ + \ + if (!is_kernel_in_hyp_mode()) \ + break; \ + \ + ___ctx = this_cpu_ptr(&nmi_contexts); \ + ___hcr = ___ctx->hcr; \ + /* \ + * Make sure we read ___ctx->hcr before we release \ + * ___ctx->cnt as it makes ___ctx->hcr updatable again. \ + */ \ + barrier(); \ + ___ctx->cnt--; \ + /* \ + * Make sure ___ctx->cnt release is visible before we \ + * restore the sysreg. Otherwise a new NMI occurring \ + * right after write_sysreg() can be fooled and think \ + * we secured things for it. \ + */ \ + barrier(); \ + if (!___ctx->cnt && !(___hcr & HCR_TGE)) \ + write_sysreg(___hcr, hcr_el2); \ +} while (0) static inline void ack_bad_irq(unsigned int irq) { diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 2eb6c234d594..94ba0c5bced2 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -17,22 +17,11 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); #endif -#define __HAVE_ARCH_HUGE_PTEP_GET -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return READ_ONCE(*ptep); -} - -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable); diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 0f00265248b5..d683bcbf1e7c 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -94,6 +94,7 @@ #define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16) #define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) #define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) +#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index bb313dde58a4..0bc46149e491 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -39,13 +39,37 @@ enum aarch64_insn_encoding_class { * system instructions */ }; -enum aarch64_insn_hint_op { +enum aarch64_insn_hint_cr_op { AARCH64_INSN_HINT_NOP = 0x0 << 5, AARCH64_INSN_HINT_YIELD = 0x1 << 5, AARCH64_INSN_HINT_WFE = 0x2 << 5, AARCH64_INSN_HINT_WFI = 0x3 << 5, AARCH64_INSN_HINT_SEV = 0x4 << 5, AARCH64_INSN_HINT_SEVL = 0x5 << 5, + + AARCH64_INSN_HINT_XPACLRI = 0x07 << 5, + AARCH64_INSN_HINT_PACIA_1716 = 0x08 << 5, + AARCH64_INSN_HINT_PACIB_1716 = 0x0A << 5, + AARCH64_INSN_HINT_AUTIA_1716 = 0x0C << 5, + AARCH64_INSN_HINT_AUTIB_1716 = 0x0E << 5, + 
AARCH64_INSN_HINT_PACIAZ = 0x18 << 5, + AARCH64_INSN_HINT_PACIASP = 0x19 << 5, + AARCH64_INSN_HINT_PACIBZ = 0x1A << 5, + AARCH64_INSN_HINT_PACIBSP = 0x1B << 5, + AARCH64_INSN_HINT_AUTIAZ = 0x1C << 5, + AARCH64_INSN_HINT_AUTIASP = 0x1D << 5, + AARCH64_INSN_HINT_AUTIBZ = 0x1E << 5, + AARCH64_INSN_HINT_AUTIBSP = 0x1F << 5, + + AARCH64_INSN_HINT_ESB = 0x10 << 5, + AARCH64_INSN_HINT_PSB = 0x11 << 5, + AARCH64_INSN_HINT_TSB = 0x12 << 5, + AARCH64_INSN_HINT_CSDB = 0x14 << 5, + + AARCH64_INSN_HINT_BTI = 0x20 << 5, + AARCH64_INSN_HINT_BTIC = 0x22 << 5, + AARCH64_INSN_HINT_BTIJ = 0x24 << 5, + AARCH64_INSN_HINT_BTIJC = 0x26 << 5, }; enum aarch64_insn_imm_type { @@ -344,7 +368,7 @@ __AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) #undef __AARCH64_INSN_FUNCS -bool aarch64_insn_is_nop(u32 insn); +bool aarch64_insn_is_steppable_hint(u32 insn); bool aarch64_insn_is_branch_imm(u32 insn); static inline bool aarch64_insn_is_adr_adrp(u32 insn) @@ -370,7 +394,7 @@ u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, enum aarch64_insn_branch_type type); u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, enum aarch64_insn_condition cond); -u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_op op); +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op); u32 aarch64_insn_gen_nop(void); u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, enum aarch64_insn_branch_type type); diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7c7eeeaab9fa..0c9b5fc4ba0a 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -64,12 +64,14 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); -extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); +extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu); extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu); +extern void __kvm_enable_ssbs(void); + extern u64 __vgic_v3_get_ich_vtr_el2(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a30b4eec7cb4..6ea53e6e8b26 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -507,10 +507,12 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, static __always_inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) { - if (vcpu_mode_is_32bit(vcpu)) + if (vcpu_mode_is_32bit(vcpu)) { kvm_skip_instr32(vcpu, is_wide_instr); - else + } else { *vcpu_pc(vcpu) += 4; + *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK; + } /* advance the singlestep state machine */ *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 32c8a675e5a4..abbdf9703e20 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -46,6 +46,9 @@ #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) +#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ + KVM_DIRTY_LOG_INITIALLY_SET) + DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int kvm_sve_max_vl; @@ -112,12 +115,8 @@ struct kvm_vcpu_fault_info { u64 disr_el1; /* Deferred [SError] Status Register */ }; -/* - * 0 is reserved as an invalid 
value. - * Order should be kept in sync with the save/restore code. - */ enum vcpu_sysreg { - __INVALID_SYSREG__, + __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ CSSELR_EL1, /* Cache Size Selection Register */ SCTLR_EL1, /* System Control Register */ @@ -415,6 +414,8 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { u64 halt_successful_poll; u64 halt_attempted_poll; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; u64 halt_poll_invalid; u64 halt_wakeup; u64 hvc_exit_stat; @@ -530,39 +531,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) cpu_ctxt->sys_regs[MPIDR_EL1] = read_cpuid_mpidr(); } -void __kvm_enable_ssbs(void); - -static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, - unsigned long hyp_stack_ptr, - unsigned long vector_ptr) -{ - /* - * Calculate the raw per-cpu offset without a translation from the - * kernel's mapping to the linear mapping, and store it in tpidr_el2 - * so that we can use adr_l to access per-cpu variables in EL2. - */ - u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_data) - - (u64)kvm_ksym_ref(kvm_host_data)); - - /* - * Call initialization code, and switch to the full blown HYP code. - * If the cpucaps haven't been finalized yet, something has gone very - * wrong, and hyp will crash and burn when it uses any - * cpus_have_const_cap() wrapper. - */ - BUG_ON(!system_capabilities_finalized()); - __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); - - /* - * Disabling SSBD on a non-VHE system requires us to enable SSBS - * at EL2. - */ - if (!has_vhe() && this_cpu_has_cap(ARM64_SSBS) && - arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { - kvm_call_hyp(__kvm_enable_ssbs); - } -} - static inline bool kvm_arch_requires_vhe(void) { /* @@ -573,10 +541,6 @@ static inline bool kvm_arch_requires_vhe(void) if (system_supports_sve()) return true; - /* Some implementations have defects that confine them to VHE */ - if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) - return true; - return false; } @@ -598,8 +562,6 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -static inline void __cpu_init_stage2(void) {} - /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); @@ -670,7 +632,7 @@ static inline int kvm_arm_have_ssbd(void) void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); -void kvm_set_ipa_limit(void); +int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index fe57f60f06a8..ce3080834bfa 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -10,10 +10,9 @@ #include <linux/compiler.h> #include <linux/kvm_host.h> #include <asm/alternative.h> -#include <asm/kvm_mmu.h> #include <asm/sysreg.h> -#define __hyp_text __section(.hyp.text) notrace +#define __hyp_text __section(.hyp.text) notrace __noscs #define read_sysreg_elx(r,nvh,vh) \ ({ \ @@ -56,12 +55,12 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); -void __vgic_v3_save_state(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); -void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu); -void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu); -void 
__vgic_v3_save_aprs(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu); +void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); void __timer_enable_traps(struct kvm_vcpu *vcpu); @@ -88,22 +87,5 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); void __noreturn __hyp_do_panic(unsigned long, ...); -/* - * Must be called from hyp code running at EL2 with an updated VTTBR - * and interrupts disabled. - */ -static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) -{ - write_sysreg(kvm->arch.vtcr, vtcr_el2); - write_sysreg(kvm_get_vttbr(kvm), vttbr_el2); - - /* - * ARM errata 1165522 and 1530923 require the actual execution of the - * above before we can switch to the EL1/EL0 translation regime used by - * the guest. - */ - asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT_VHE)); -} - #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 30b0e8d6b895..324c8483d2b9 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -363,8 +363,6 @@ static inline void __kvm_flush_dcache_pud(pud_t pud) } } -#define kvm_virt_to_phys(x) __pa_symbol(x) - void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); @@ -416,7 +414,7 @@ static inline unsigned int kvm_get_vmid_bits(void) { int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; + return get_vmid_bits(reg); } /* @@ -473,7 +471,7 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, extern void *__kvm_bp_vect_base; extern int __kvm_harden_el2_vector_slot; -/* This is only called on a VHE system */ +/* This is called on both VHE and !VHE systems */ static inline void *kvm_get_hyp_vector(void) { struct bp_hardening_data *data = arm64_get_bp_hardening_data(); @@ -604,5 +602,22 @@ static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } +/* + * Must be called from hyp code running at EL2 with an updated VTTBR + * and interrupts disabled. + */ +static __always_inline void __load_guest_stage2(struct kvm *kvm) +{ + write_sysreg(kvm->arch.vtcr, vtcr_el2); + write_sysreg(kvm_get_vttbr(kvm), vttbr_el2); + + /* + * ARM errata 1165522 and 1530923 require the actual execution of the + * above before we can switch to the EL1/EL0 translation regime used by + * the guest. 
+ */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); +} + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index ebee3113a62f..81fefd2a1d02 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -4,6 +4,52 @@ #define __ALIGN .align 2 #define __ALIGN_STR ".align 2" +#if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__) + +/* + * Since current versions of gas reject the BTI instruction unless we + * set the architecture version to v8.5 we use the hint instruction + * instead. + */ +#define BTI_C hint 34 ; +#define BTI_J hint 36 ; + +/* + * When using in-kernel BTI we need to ensure that PCS-conformant assembly + * functions have suitable annotations. Override SYM_FUNC_START to insert + * a BTI landing pad at the start of everything. + */ +#define SYM_FUNC_START(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + BTI_C + +#define SYM_FUNC_START_NOALIGN(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \ + BTI_C + +#define SYM_FUNC_START_LOCAL(name) \ + SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ + BTI_C + +#define SYM_FUNC_START_LOCAL_NOALIGN(name) \ + SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \ + BTI_C + +#define SYM_FUNC_START_WEAK(name) \ + SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ + BTI_C + +#define SYM_FUNC_START_WEAK_NOALIGN(name) \ + SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \ + BTI_C + +#define SYM_INNER_LABEL(name, linkage) \ + .type name SYM_T_NONE ASM_NL \ + SYM_ENTRY(name, linkage, SYM_A_NONE) \ + BTI_J + +#endif + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h new file mode 100644 index 000000000000..081ec8de9ea6 --- /dev/null +++ b/arch/arm64/include/asm/mman.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#include <linux/compiler.h> +#include <linux/types.h> +#include <uapi/asm/mman.h> + +static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, + unsigned long pkey __always_unused) +{ + if (system_supports_bti() && (prot & PROT_BTI)) + return VM_ARM64_BTI; + + return 0; +} +#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) + +static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) +{ + return (vm_flags & VM_ARM64_BTI) ? __pgprot(PTE_GP) : __pgprot(0); +} +#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags) + +static inline bool arch_validate_prot(unsigned long prot, + unsigned long addr __always_unused) +{ + unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM; + + if (system_supports_bti()) + supported |= PROT_BTI; + + return (prot & ~supported) == 0; +} +#define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) + +#endif /* ! 
__ASM_MMAN_H__ */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 6bf5e650da78..9c91a8f93a0e 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -151,6 +151,7 @@ #define PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ #define PTE_AF (_AT(pteval_t, 1) << 10) /* Access Flag */ #define PTE_NG (_AT(pteval_t, 1) << 11) /* nG */ +#define PTE_GP (_AT(pteval_t, 1) << 50) /* BTI guarded */ #define PTE_DBM (_AT(pteval_t, 1) << 51) /* Dirty Bit Management */ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ @@ -190,7 +191,6 @@ * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) -#define PTE_S2_MEMATTR_MASK (_AT(pteval_t, 0xf) << 2) /* * EL2/HYP PTE/PMD definitions diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 1305e28225fc..2e7e0f452301 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -21,6 +21,7 @@ #ifndef __ASSEMBLY__ +#include <asm/cpufeature.h> #include <asm/pgtable-types.h> extern bool arm64_use_ng_mappings; @@ -31,6 +32,16 @@ extern bool arm64_use_ng_mappings; #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) +/* + * If we have userspace only BTI we don't want to mark kernel pages + * guarded even if the system does support BTI. + */ +#ifdef CONFIG_ARM64_BTI_KERNEL +#define PTE_MAYBE_GP (system_supports_bti() ? PTE_GP : 0) +#else +#define PTE_MAYBE_GP 0 +#endif + #define PROT_DEFAULT (_PROT_DEFAULT | PTE_MAYBE_NG) #define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_MAYBE_NG) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 538c85e62f86..9ce000f22d9e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -366,7 +366,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) -#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) +#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) @@ -407,6 +407,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) +#define pgprot_nx(prot) \ + __pgprot_modify(prot, 0, PTE_PXN) + /* * Mark the prot value as uncacheable and unbufferable. 
*/ @@ -457,6 +460,7 @@ extern pgd_t init_pg_dir[PTRS_PER_PGD]; extern pgd_t init_pg_end[]; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; +extern pgd_t idmap_pg_end[]; extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); @@ -508,7 +512,7 @@ static inline void pte_unmap(pte_t *pte) { } #define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) #define pte_clear_fixmap() clear_fixmap(FIX_PTE) -#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(__pmd_to_phys(pmd))) +#define pmd_page(pmd) phys_to_page(__pmd_to_phys(pmd)) /* use ONLY for statically allocated translation tables */ #define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) @@ -566,7 +570,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) #define pmd_clear_fixmap() clear_fixmap(FIX_PMD) -#define pud_page(pud) pfn_to_page(__phys_to_pfn(__pud_to_phys(pud))) +#define pud_page(pud) phys_to_page(__pud_to_phys(pud)) /* use ONLY for statically allocated translation tables */ #define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) @@ -624,7 +628,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) #define pud_clear_fixmap() clear_fixmap(FIX_PUD) -#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd))) +#define pgd_page(pgd) phys_to_page(__pgd_to_phys(pgd)) /* use ONLY for statically allocated translation tables */ #define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) @@ -660,7 +664,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | - PTE_PROT_NONE | PTE_VALID | PTE_WRITE; + PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = pte_mkdirty(pte); diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index bf57308fcd63..953b6a1ce549 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -35,6 +35,7 @@ #define GIC_PRIO_PSR_I_SET (1 << 4) /* Additional SPSR bits not exposed in the UABI */ +#define PSR_MODE_THREAD_BIT (1 << 0) #define PSR_IL_BIT (1 << 20) /* AArch32-specific ptrace requests */ diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h new file mode 100644 index 000000000000..eaa2cd92e4c1 --- /dev/null +++ b/arch/arm64/include/asm/scs.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_SCS_H +#define _ASM_SCS_H + +#ifdef __ASSEMBLY__ + +#include <asm/asm-offsets.h> + +#ifdef CONFIG_SHADOW_CALL_STACK + scs_sp .req x18 + + .macro scs_load tsk, tmp + ldr scs_sp, [\tsk, #TSK_TI_SCS_SP] + .endm + + .macro scs_save tsk, tmp + str scs_sp, [\tsk, #TSK_TI_SCS_SP] + .endm +#else + .macro scs_load tsk, tmp + .endm + + .macro scs_save tsk, tmp + .endm +#endif /* CONFIG_SHADOW_CALL_STACK */ + +#endif /* __ASSEMBLY __ */ + +#endif /* _ASM_SCS_H */ diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 40d5ba029615..ea268d88b6f7 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -23,14 +23,6 @@ #define CPU_STUCK_REASON_52_BIT_VA (UL(1) << CPU_STUCK_REASON_SHIFT) #define CPU_STUCK_REASON_NO_GRAN (UL(2) << 
CPU_STUCK_REASON_SHIFT) -/* Possible options for __cpu_setup */ -/* Option to setup primary cpu */ -#define ARM64_CPU_BOOT_PRIMARY (1) -/* Option to setup secondary cpus */ -#define ARM64_CPU_BOOT_SECONDARY (2) -/* Option to setup cpus for different cpu run time services */ -#define ARM64_CPU_RUNTIME (3) - #ifndef __ASSEMBLY__ #include <asm/percpu.h> @@ -96,9 +88,6 @@ asmlinkage void secondary_start_kernel(void); struct secondary_data { void *stack; struct task_struct *task; -#ifdef CONFIG_ARM64_PTR_AUTH - struct ptrauth_keys_kernel ptrauth_key; -#endif long status; }; diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 4d9b1f48dc39..5017b531a415 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -68,12 +68,10 @@ extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_irq_stack(unsigned long sp, +static inline bool on_stack(unsigned long sp, unsigned long low, + unsigned long high, enum stack_type type, struct stack_info *info) { - unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); - unsigned long high = low + IRQ_STACK_SIZE; - if (!low) return false; @@ -83,12 +81,20 @@ static inline bool on_irq_stack(unsigned long sp, if (info) { info->low = low; info->high = high; - info->type = STACK_TYPE_IRQ; + info->type = type; } - return true; } +static inline bool on_irq_stack(unsigned long sp, + struct stack_info *info) +{ + unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); + unsigned long high = low + IRQ_STACK_SIZE; + + return on_stack(sp, low, high, STACK_TYPE_IRQ, info); +} + static inline bool on_task_stack(const struct task_struct *tsk, unsigned long sp, struct stack_info *info) @@ -96,16 +102,7 @@ static inline bool on_task_stack(const struct task_struct *tsk, unsigned long low = (unsigned long)task_stack_page(tsk); unsigned long high = low + THREAD_SIZE; - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_TASK; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_TASK, info); } #ifdef CONFIG_VMAP_STACK @@ -117,16 +114,7 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); unsigned long high = low + OVERFLOW_STACK_SIZE; - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_OVERFLOW; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_OVERFLOW, info); } #else static inline bool on_overflow_stack(unsigned long sp, diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 8939c87c4dce..0cde2f473971 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -2,7 +2,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 12 +#define NR_CTX_REGS 13 #define NR_CALLEE_SAVED_REGS 12 /* diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index c4ac0ac25a00..463175f80341 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -105,6 +105,10 @@ #define SYS_DC_CSW sys_insn(1, 0, 7, 10, 2) #define SYS_DC_CISW sys_insn(1, 0, 7, 14, 2) +/* + * System registers, organised loosely by encoding but grouped together + * where the architected name contains an index. e.g. ID_MMFR<n>_EL1. 
+ */ #define SYS_OSDTRRX_EL1 sys_reg(2, 0, 0, 0, 2) #define SYS_MDCCINT_EL1 sys_reg(2, 0, 0, 2, 0) #define SYS_MDSCR_EL1 sys_reg(2, 0, 0, 2, 2) @@ -134,12 +138,16 @@ #define SYS_ID_PFR0_EL1 sys_reg(3, 0, 0, 1, 0) #define SYS_ID_PFR1_EL1 sys_reg(3, 0, 0, 1, 1) +#define SYS_ID_PFR2_EL1 sys_reg(3, 0, 0, 3, 4) #define SYS_ID_DFR0_EL1 sys_reg(3, 0, 0, 1, 2) +#define SYS_ID_DFR1_EL1 sys_reg(3, 0, 0, 3, 5) #define SYS_ID_AFR0_EL1 sys_reg(3, 0, 0, 1, 3) #define SYS_ID_MMFR0_EL1 sys_reg(3, 0, 0, 1, 4) #define SYS_ID_MMFR1_EL1 sys_reg(3, 0, 0, 1, 5) #define SYS_ID_MMFR2_EL1 sys_reg(3, 0, 0, 1, 6) #define SYS_ID_MMFR3_EL1 sys_reg(3, 0, 0, 1, 7) +#define SYS_ID_MMFR4_EL1 sys_reg(3, 0, 0, 2, 6) +#define SYS_ID_MMFR5_EL1 sys_reg(3, 0, 0, 3, 6) #define SYS_ID_ISAR0_EL1 sys_reg(3, 0, 0, 2, 0) #define SYS_ID_ISAR1_EL1 sys_reg(3, 0, 0, 2, 1) @@ -147,7 +155,6 @@ #define SYS_ID_ISAR3_EL1 sys_reg(3, 0, 0, 2, 3) #define SYS_ID_ISAR4_EL1 sys_reg(3, 0, 0, 2, 4) #define SYS_ID_ISAR5_EL1 sys_reg(3, 0, 0, 2, 5) -#define SYS_ID_MMFR4_EL1 sys_reg(3, 0, 0, 2, 6) #define SYS_ID_ISAR6_EL1 sys_reg(3, 0, 0, 2, 7) #define SYS_MVFR0_EL1 sys_reg(3, 0, 0, 3, 0) @@ -552,6 +559,8 @@ #endif /* SCTLR_EL1 specific flags. */ +#define SCTLR_EL1_BT1 (BIT(36)) +#define SCTLR_EL1_BT0 (BIT(35)) #define SCTLR_EL1_UCI (BIT(26)) #define SCTLR_EL1_E0E (BIT(24)) #define SCTLR_EL1_SPAN (BIT(23)) @@ -594,6 +603,7 @@ /* id_aa64isar0 */ #define ID_AA64ISAR0_RNDR_SHIFT 60 +#define ID_AA64ISAR0_TLB_SHIFT 56 #define ID_AA64ISAR0_TS_SHIFT 52 #define ID_AA64ISAR0_FHM_SHIFT 48 #define ID_AA64ISAR0_DP_SHIFT 44 @@ -637,6 +647,8 @@ #define ID_AA64PFR0_CSV2_SHIFT 56 #define ID_AA64PFR0_DIT_SHIFT 48 #define ID_AA64PFR0_AMU_SHIFT 44 +#define ID_AA64PFR0_MPAM_SHIFT 40 +#define ID_AA64PFR0_SEL2_SHIFT 36 #define ID_AA64PFR0_SVE_SHIFT 32 #define ID_AA64PFR0_RAS_SHIFT 28 #define ID_AA64PFR0_GIC_SHIFT 24 @@ -655,15 +667,21 @@ #define ID_AA64PFR0_ASIMD_NI 0xf #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 #define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 +#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2 #define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 #define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 /* id_aa64pfr1 */ +#define ID_AA64PFR1_MPAMFRAC_SHIFT 16 +#define ID_AA64PFR1_RASFRAC_SHIFT 12 +#define ID_AA64PFR1_MTE_SHIFT 8 #define ID_AA64PFR1_SSBS_SHIFT 4 +#define ID_AA64PFR1_BT_SHIFT 0 #define ID_AA64PFR1_SSBS_PSTATE_NI 0 #define ID_AA64PFR1_SSBS_PSTATE_ONLY 1 #define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 +#define ID_AA64PFR1_BT_BTI 0x1 /* id_aa64zfr0 */ #define ID_AA64ZFR0_F64MM_SHIFT 56 @@ -688,6 +706,9 @@ #define ID_AA64ZFR0_SVEVER_SVE2 0x1 /* id_aa64mmfr0 */ +#define ID_AA64MMFR0_TGRAN4_2_SHIFT 40 +#define ID_AA64MMFR0_TGRAN64_2_SHIFT 36 +#define ID_AA64MMFR0_TGRAN16_2_SHIFT 32 #define ID_AA64MMFR0_TGRAN4_SHIFT 28 #define ID_AA64MMFR0_TGRAN64_SHIFT 24 #define ID_AA64MMFR0_TGRAN16_SHIFT 20 @@ -752,6 +773,25 @@ #define ID_DFR0_PERFMON_8_1 0x4 +#define ID_ISAR4_SWP_FRAC_SHIFT 28 +#define ID_ISAR4_PSR_M_SHIFT 24 +#define ID_ISAR4_SYNCH_PRIM_FRAC_SHIFT 20 +#define ID_ISAR4_BARRIER_SHIFT 16 +#define ID_ISAR4_SMC_SHIFT 12 +#define ID_ISAR4_WRITEBACK_SHIFT 8 +#define ID_ISAR4_WITHSHIFTS_SHIFT 4 +#define ID_ISAR4_UNPRIV_SHIFT 0 + +#define ID_DFR1_MTPMU_SHIFT 0 + +#define ID_ISAR0_DIVIDE_SHIFT 24 +#define ID_ISAR0_DEBUG_SHIFT 20 +#define ID_ISAR0_COPROC_SHIFT 16 +#define ID_ISAR0_CMPBRANCH_SHIFT 12 +#define ID_ISAR0_BITFIELD_SHIFT 8 +#define ID_ISAR0_BITCOUNT_SHIFT 4 +#define ID_ISAR0_SWAP_SHIFT 0 + #define ID_ISAR5_RDM_SHIFT 24 #define ID_ISAR5_CRC32_SHIFT 16 #define ID_ISAR5_SHA2_SHIFT 12 @@ -767,6 +807,22 @@ #define 
ID_ISAR6_DP_SHIFT 4 #define ID_ISAR6_JSCVT_SHIFT 0 +#define ID_MMFR4_EVT_SHIFT 28 +#define ID_MMFR4_CCIDX_SHIFT 24 +#define ID_MMFR4_LSM_SHIFT 20 +#define ID_MMFR4_HPDS_SHIFT 16 +#define ID_MMFR4_CNP_SHIFT 12 +#define ID_MMFR4_XNX_SHIFT 8 +#define ID_MMFR4_SPECSEI_SHIFT 0 + +#define ID_MMFR5_ETS_SHIFT 0 + +#define ID_PFR0_DIT_SHIFT 24 +#define ID_PFR0_CSV2_SHIFT 16 + +#define ID_PFR2_SSBS_SHIFT 4 +#define ID_PFR2_CSV3_SHIFT 0 + #define MVFR0_FPROUND_SHIFT 28 #define MVFR0_FPSHVEC_SHIFT 24 #define MVFR0_FPSQRT_SHIFT 20 @@ -785,17 +841,14 @@ #define MVFR1_FPDNAN_SHIFT 4 #define MVFR1_FPFTZ_SHIFT 0 - -#define ID_AA64MMFR0_TGRAN4_SHIFT 28 -#define ID_AA64MMFR0_TGRAN64_SHIFT 24 -#define ID_AA64MMFR0_TGRAN16_SHIFT 20 - -#define ID_AA64MMFR0_TGRAN4_NI 0xf -#define ID_AA64MMFR0_TGRAN4_SUPPORTED 0x0 -#define ID_AA64MMFR0_TGRAN64_NI 0xf -#define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0 -#define ID_AA64MMFR0_TGRAN16_NI 0x0 -#define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1 +#define ID_PFR1_GIC_SHIFT 28 +#define ID_PFR1_VIRT_FRAC_SHIFT 24 +#define ID_PFR1_SEC_FRAC_SHIFT 20 +#define ID_PFR1_GENTIMER_SHIFT 16 +#define ID_PFR1_VIRTUALIZATION_SHIFT 12 +#define ID_PFR1_MPROGMOD_SHIFT 8 +#define ID_PFR1_SECURITY_SHIFT 4 +#define ID_PFR1_PROGMOD_SHIFT 0 #if defined(CONFIG_ARM64_4K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 512174a8e789..6ea8b6a26ae9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,10 @@ struct thread_info { #endif } preempt; }; +#ifdef CONFIG_SHADOW_CALL_STACK + void *scs_base; + void *scs_sp; +#endif }; #define thread_saved_pc(tsk) \ @@ -100,11 +104,20 @@ void arch_release_task_struct(struct task_struct *tsk); _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#ifdef CONFIG_SHADOW_CALL_STACK +#define INIT_SCS \ + .scs_base = init_shadow_call_stack, \ + .scs_sp = init_shadow_call_stack, +#else +#define INIT_SCS +#endif + #define INIT_THREAD_INFO(tsk) \ { \ .flags = _TIF_FOREIGN_FPSTATE, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ + INIT_SCS \ } #endif /* __ASM_THREAD_INFO_H */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 32fc8061aa76..bc5c7b091152 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -304,7 +304,7 @@ do { \ __p = uaccess_mask_ptr(__p); \ __raw_get_user((x), __p, (err)); \ } else { \ - (x) = 0; (err) = -EFAULT; \ + (x) = (__force __typeof__(x))0; (err) = -EFAULT; \ } \ } while (0) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 803039d504de..3b859596840d 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 439 +#define __NR_compat_syscalls 440 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index c1c61635f89c..6d95d0c8bf2f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -883,6 +883,8 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) /* * Please add new compat syscalls above this comment and update 
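The ID register *_SHIFT definitions added above are consumed by the cpufeature sanitisation code, which pulls each 4-bit feature field out of a sanitised register value (cpuid_feature_extract_unsigned_field() elsewhere in this series). A minimal standalone C sketch of that extraction, purely illustrative: the helper name, the ID_PFR2_EL1 value and the printf harness below are assumptions for demonstration, not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define ID_PFR2_SSBS_SHIFT	4	/* mirrors the sysreg.h addition above */

/* Each ID register feature field is an unsigned 4-bit value */
static unsigned int extract_unsigned_field(uint64_t reg, unsigned int shift)
{
	return (reg >> shift) & 0xf;
}

int main(void)
{
	uint64_t id_pfr2 = 0x11;	/* hypothetical sanitised ID_PFR2_EL1 contents */

	printf("ID_PFR2_EL1.SSBS = %u\n",
	       extract_unsigned_field(id_pfr2, ID_PFR2_SSBS_SHIFT));
	return 0;
}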
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 61fd26752adc..5051b388c654 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -85,7 +85,7 @@ static inline bool is_kernel_in_hyp_mode(void) static __always_inline bool has_vhe(void) { - if (cpus_have_const_cap(ARM64_HAS_VIRT_HOST_EXTN)) + if (cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN)) return true; return false; diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 0a12115d9638..0cc6636e3f15 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -19,10 +19,8 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node_range(stack_size, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, PAGE_KERNEL, 0, node, - __builtin_return_address(0)); + return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + __builtin_return_address(0)); } #endif /* __ASM_VMAP_STACK_H */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 7752d93bb50f..2d6ba1c2592e 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -73,5 +73,6 @@ #define HWCAP2_BF16 (1 << 14) #define HWCAP2_DGH (1 << 15) #define HWCAP2_RNG (1 << 16) +#define HWCAP2_BTI (1 << 17) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h new file mode 100644 index 000000000000..6fdd71eb644f --- /dev/null +++ b/arch/arm64/include/uapi/asm/mman.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__ASM_MMAN_H +#define _UAPI__ASM_MMAN_H + +#include <asm-generic/mman.h> + +#define PROT_BTI 0x10 /* BTI guarded page */ + +#endif /* ! 
_UAPI__ASM_MMAN_H */ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index d1bb5b69f1ce..42cbe34d95ce 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -46,6 +46,7 @@ #define PSR_I_BIT 0x00000080 #define PSR_A_BIT 0x00000100 #define PSR_D_BIT 0x00000200 +#define PSR_BTYPE_MASK 0x00000c00 #define PSR_SSBS_BIT 0x00001000 #define PSR_PAN_BIT 0x00400000 #define PSR_UAO_BIT 0x00800000 @@ -55,6 +56,8 @@ #define PSR_Z_BIT 0x40000000 #define PSR_N_BIT 0x80000000 +#define PSR_BTYPE_SHIFT 10 + /* * Groups of PSR bits */ @@ -63,6 +66,12 @@ #define PSR_x 0x0000ff00 /* Extension */ #define PSR_c 0x000000ff /* Control */ +/* Convenience names for the values of PSTATE.BTYPE */ +#define PSR_BTYPE_NONE (0b00 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_JC (0b01 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_C (0b10 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_J (0b11 << PSR_BTYPE_SHIFT) + /* syscall emulation path in ptrace */ #define PTRACE_SYSEMU 31 #define PTRACE_SYSEMU_SINGLESTEP 32 diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 4e5b8ee31442..151f28521f1e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-y += vdso/ probes/ obj-$(CONFIG_COMPAT_VDSO) += vdso32/ diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index a100483b47c4..46ec402e97ed 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -19,6 +19,7 @@ #include <linux/init.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/irq_work.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/smp.h> @@ -269,6 +270,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) int apei_claim_sea(struct pt_regs *regs) { int err = -ENOENT; + bool return_to_irqs_enabled; unsigned long current_flags; if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES)) @@ -276,6 +278,12 @@ int apei_claim_sea(struct pt_regs *regs) current_flags = local_daif_save_flags(); + /* current_flags isn't useful here as daif doesn't tell us about pNMI */ + return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); + + if (regs) + return_to_irqs_enabled = interrupts_enabled(regs); + /* * SEA can interrupt SError, mask it and describe this as an NMI so * that APEI defers the handling. @@ -284,6 +292,23 @@ int apei_claim_sea(struct pt_regs *regs) nmi_enter(); err = ghes_notify_sea(); nmi_exit(); + + /* + * APEI NMI-like notifications are deferred to irq_work. Unless + * we interrupted irqs-masked code, we can do that now. 
+ */ + if (!err) { + if (return_to_irqs_enabled) { + local_daif_restore(DAIF_PROCCTX_NOIRQ); + __irq_enter(); + irq_work_run(); + __irq_exit(); + } else { + pr_warn_ratelimited("APEI work queued but not completed"); + err = -EINPROGRESS; + } + } + local_daif_restore(current_flags); return err; diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c19aa81ddc8c..7364de008bab 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 9981a0a5a87f..0577e2142284 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -34,6 +34,10 @@ int main(void) #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif +#ifdef CONFIG_SHADOW_CALL_STACK + DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base)); + DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp)); +#endif DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); #ifdef CONFIG_STACKPROTECTOR DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary)); @@ -92,11 +96,8 @@ int main(void) BLANK(); DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); -#ifdef CONFIG_ARM64_PTR_AUTH - DEFINE(CPU_BOOT_PTRAUTH_KEY, offsetof(struct secondary_data, ptrauth_key)); -#endif BLANK(); -#ifdef CONFIG_KVM_ARM_HOST +#ifdef CONFIG_KVM DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags)); diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 38087b4c0432..4a18055b2ff9 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -29,7 +29,7 @@ * branch to what would be the reset vector. It must be executed with the * flat identity mapping. */ -ENTRY(__cpu_soft_restart) +SYM_CODE_START(__cpu_soft_restart) /* Clear sctlr_el1 flags. 
*/ mrs x12, sctlr_el1 mov_q x13, SCTLR_ELx_FLAGS @@ -47,6 +47,6 @@ ENTRY(__cpu_soft_restart) mov x1, x3 // arg1 mov x2, x4 // arg2 br x8 -ENDPROC(__cpu_soft_restart) +SYM_CODE_END(__cpu_soft_restart) .popsection diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index df56d2295d16..ad06d6802d2e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -234,7 +234,7 @@ static int detect_harden_bp_fw(void) smccc_end = NULL; break; -#if IS_ENABLED(CONFIG_KVM_ARM_HOST) +#if IS_ENABLED(CONFIG_KVM) case SMCCC_CONDUIT_SMC: cb = call_smc_arch_workaround_1; smccc_start = __smccc_workaround_1_smc; @@ -635,7 +635,7 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, return is_midr_in_range(midr, &range) && has_dic; } -#if defined(CONFIG_HARDEN_EL2_VECTORS) || defined(CONFIG_ARM64_ERRATUM_1319367) +#if defined(CONFIG_HARDEN_EL2_VECTORS) static const struct midr_range ca57_a72[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), @@ -757,12 +757,16 @@ static const struct arm64_cpu_capabilities erratum_843419_list[] = { }; #endif -#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT_VHE -static const struct midr_range erratum_speculative_at_vhe_list[] = { +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT +static const struct midr_range erratum_speculative_at_list[] = { #ifdef CONFIG_ARM64_ERRATUM_1165522 /* Cortex A76 r0p0 to r2p0 */ MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), #endif +#ifdef CONFIG_ARM64_ERRATUM_1319367 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), +#endif #ifdef CONFIG_ARM64_ERRATUM_1530923 /* Cortex A55 r0p0 to r2p0 */ MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 2, 0), @@ -774,7 +778,7 @@ static const struct midr_range erratum_speculative_at_vhe_list[] = { const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { - .desc = "ARM errata 826319, 827319, 824069, 819472", + .desc = "ARM errata 826319, 827319, 824069, or 819472", .capability = ARM64_WORKAROUND_CLEAN_CACHE, ERRATA_MIDR_RANGE_LIST(workaround_clean_cache), .cpu_enable = cpu_enable_cache_maint_trap, @@ -856,7 +860,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI { - .desc = "Qualcomm erratum 1009, ARM erratum 1286807", + .desc = "Qualcomm erratum 1009, or ARM erratum 1286807", .capability = ARM64_WORKAROUND_REPEAT_TLBI, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = cpucap_multi_entry_cap_matches, @@ -897,11 +901,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_1418040_list), }, #endif -#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT_VHE +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT { - .desc = "ARM errata 1165522, 1530923", - .capability = ARM64_WORKAROUND_SPECULATIVE_AT_VHE, - ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_vhe_list), + .desc = "ARM errata 1165522, 1319367, or 1530923", + .capability = ARM64_WORKAROUND_SPECULATIVE_AT, + ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_list), }, #endif #ifdef CONFIG_ARM64_ERRATUM_1463225 @@ -935,13 +939,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .cpu_enable = cpu_enable_trap_ctr_access, }, #endif -#ifdef CONFIG_ARM64_ERRATUM_1319367 - { - .desc = "ARM erratum 1319367", - .capability = ARM64_WORKAROUND_SPECULATIVE_AT_NVHE, - ERRATA_MIDR_RANGE_LIST(ca57_a72), - }, -#endif { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9fac745aa7bb..4ae41670c2e6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ 
b/arch/arm64/kernel/cpufeature.c @@ -3,6 +3,61 @@ * Contains CPU feature definitions * * Copyright (C) 2015 ARM Ltd. + * + * A note for the weary kernel hacker: the code here is confusing and hard to + * follow! That's partly because it's solving a nasty problem, but also because + * there's a little bit of over-abstraction that tends to obscure what's going + * on behind a maze of helper functions and macros. + * + * The basic problem is that hardware folks have started gluing together CPUs + * with distinct architectural features; in some cases even creating SoCs where + * user-visible instructions are available only on a subset of the available + * cores. We try to address this by snapshotting the feature registers of the + * boot CPU and comparing these with the feature registers of each secondary + * CPU when bringing them up. If there is a mismatch, then we update the + * snapshot state to indicate the lowest-common denominator of the feature, + * known as the "safe" value. This snapshot state can be queried to view the + * "sanitised" value of a feature register. + * + * The sanitised register values are used to decide which capabilities we + * have in the system. These may be in the form of traditional "hwcaps" + * advertised to userspace or internal "cpucaps" which are used to configure + * things like alternative patching and static keys. While a feature mismatch + * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch + * may prevent a CPU from being onlined at all. + * + * Some implementation details worth remembering: + * + * - Mismatched features are *always* sanitised to a "safe" value, which + * usually indicates that the feature is not supported. + * + * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK" + * warning when onlining an offending CPU and the kernel will be tainted + * with TAINT_CPU_OUT_OF_SPEC. + * + * - Features marked as FTR_VISIBLE have their sanitised value visible to + * userspace. FTR_VISIBLE features in registers that are only visible + * to EL0 by trapping *must* have a corresponding HWCAP so that late + * onlining of CPUs cannot lead to features disappearing at runtime. + * + * - A "feature" is typically a 4-bit register field. A "capability" is the + * high-level description derived from the sanitised field value. + * + * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID + * scheme for fields in ID registers") to understand when feature fields + * may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly). + * + * - KVM exposes its own view of the feature registers to guest operating + * systems regardless of FTR_VISIBLE. This is typically driven from the + * sanitised register values to allow virtual CPUs to be migrated between + * arbitrary physical CPUs, but some features not present on the host are + * also advertised and emulated. Look at sys_reg_descs[] for the gory + * details. + * + * - If the arm64_ftr_bits[] for a register has a missing field, then this + * field is treated as STRICT RES0, including for read_sanitised_ftr_reg(). + * This is stronger than FTR_HIDDEN and can be used to hide features from + * KVM guests. 
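+ *
+ * - As a rough, illustrative sketch (the variable names below are made up,
+ *   not taken from this file): a mismatched FTR_LOWER_SAFE field is
+ *   sanitised to the smaller of the two values seen, i.e. roughly
+ *
+ *	safe_val = min(boot_cpu_field, new_cpu_field);
+ *
+ *   and consumers then query the snapshot with something like
+ *
+ *	u64 isar0 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
+ *	unsigned int ts = cpuid_feature_extract_unsigned_field(isar0,
+ *						ID_AA64ISAR0_TS_SHIFT);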
*/ #define pr_fmt(fmt) "CPU features: " fmt @@ -124,6 +179,7 @@ static bool __system_matches_cap(unsigned int n); */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), @@ -166,22 +222,27 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_DIT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_AMU_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_MPAM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SEL2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), - /* Linux doesn't care about the EL3 */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -209,6 +270,24 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { /* + * Page size not being supported at Stage-2 is not fatal. You + * just give up KVM if PAGE_SIZE isn't supported there. Go fix + * your favourite nesting hypervisor. + * + * There is a small corner case where the hypervisor explicitly + * advertises a given granule size at Stage-2 (value 2) on some + * vCPUs, and uses the fallback to Stage-1 (value 0) for other + * vCPUs. Although this is not forbidden by the architecture, it + * indicates that the hypervisor is being silly (or buggy). 
+ * + * We make no effort to cope with this and pretend that if these + * fields are inconsistent across vCPUs, then it isn't worth + * trying to bring KVM up. + */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN4_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN64_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN16_2_SHIFT, 4, 1), + /* * We already refuse to boot CPUs that don't support our configured * page size, so we can only detect mismatches for a page size other * than the one we're currently using. Unfortunately, SoCs like this @@ -247,7 +326,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_UAO_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CNP_SHIFT, 4, 0), @@ -289,7 +368,7 @@ static const struct arm64_ftr_bits ftr_id_mmfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 36, 28, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 36, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), @@ -316,6 +395,16 @@ static const struct arm64_ftr_bits ftr_dczid[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_isar0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DIVIDE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DEBUG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_COPROC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_CMPBRANCH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITFIELD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITCOUNT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_SWAP_SHIFT, 4, 0), + ARM64_FTR_END, +}; static const struct arm64_ftr_bits ftr_id_isar5[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0), @@ -328,7 +417,37 @@ static const struct arm64_ftr_bits ftr_id_isar5[] = { }; static const struct arm64_ftr_bits ftr_id_mmfr4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CNP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_XNX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, 
FTR_LOWER_SAFE, 4, 4, 0), /* ac2 */ + /* + * SpecSEI = 1 indicates that the PE might generate an SError on an + * external abort on speculative read. It is safer to assume that an + * SError might be generated than that it will not be. Hence it has been + * classified as FTR_HIGHER_SAFE. + */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_SPECSEI_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SWP_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_PSR_M_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SYNCH_PRIM_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_BARRIER_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SMC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WRITEBACK_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WITHSHIFTS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_UNPRIV_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_mmfr5[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_ETS_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -344,6 +463,8 @@ static const struct arm64_ftr_bits ftr_id_isar6[] = { }; static const struct arm64_ftr_bits ftr_id_pfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* State3 */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), /* State2 */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* State1 */ @@ -351,8 +472,26 @@ static const struct arm64_ftr_bits ftr_id_pfr0[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRT_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SEC_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GENTIMER_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRTUALIZATION_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_MPROGMOD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SECURITY_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_PROGMOD_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_pfr2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), + /* [31:28] TraceFilt */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), @@ -363,6 +502,11 @@ static const struct arm64_ftr_bits ftr_id_dfr0[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_dfr1[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT,
FTR_LOWER_SAFE, ID_DFR1_MTPMU_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_zcr[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0), /* LEN */ @@ -373,7 +517,7 @@ * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of * 0. Covers the following 32bit registers: - * id_isar[0-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] + * id_isar[1-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] */ static const struct arm64_ftr_bits ftr_generic_32bits[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), @@ -411,7 +555,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 1 */ ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0), - ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1), ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0), ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0), ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits), @@ -419,11 +563,11 @@ ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits), /* Op1 = 0, CRn = 0, CRm = 2 */ - ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0), ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4), ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5), ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4), ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6), @@ -432,6 +576,9 @@ ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2), + ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2), + ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1), + ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5), /* Op1 = 0, CRn = 0, CRm = 4 */ ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), @@ -468,16 +615,16 @@ static int search_cmp_ftr_reg(const void *id, const void *regp) } /* - * get_arm64_ftr_reg - Lookup a feature register entry using its - * sys_reg() encoding. With the array arm64_ftr_regs sorted in the - * ascending order of sys_id , we use binary search to find a matching + * get_arm64_ftr_reg_nowarn - Looks up a feature register entry using + * its sys_reg() encoding. With the array arm64_ftr_regs sorted in the + * ascending order of sys_id, we use binary search to find a matching + * entry. * * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure. It is up to the caller to decide * the impact of a failure. */ -static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id) { const struct __ftr_reg_entry *ret; @@ -491,6 +638,27 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) return NULL; } +/* + * get_arm64_ftr_reg - Looks up a feature register entry using + * its sys_reg() encoding. This calls get_arm64_ftr_reg_nowarn(). + * + * returns - Upon success, matching ftr_reg entry for id. + * - NULL on failure but with a WARN_ON(). + */ +static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +{ + struct arm64_ftr_reg *reg; + + reg = get_arm64_ftr_reg_nowarn(sys_id); + + /* + * Requesting a non-existent register is an error.
Warn + * and let the caller handle it. + */ + WARN_ON(!reg); + return reg; +} + static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, s64 ftr_val) { @@ -552,7 +720,8 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) const struct arm64_ftr_bits *ftrp; struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg); - BUG_ON(!reg); + if (!reg) + return; for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { u64 ftr_mask = arm64_ftr_mask(ftrp); @@ -625,6 +794,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); @@ -636,8 +806,11 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); + init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); @@ -682,7 +855,9 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); - BUG_ON(!regp); + if (!regp) + return 0; + update_cpu_ftr_reg(regp, val); if ((boot & regp->strict_mask) == (val & regp->strict_mask)) return 0; @@ -691,6 +866,104 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) return 1; } +static void relax_cpu_ftr_reg(u32 sys_id, int field) +{ + const struct arm64_ftr_bits *ftrp; + struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); + + if (!regp) + return; + + for (ftrp = regp->ftr_bits; ftrp->width; ftrp++) { + if (ftrp->shift == field) { + regp->strict_mask &= ~arm64_ftr_mask(ftrp); + break; + } + } + + /* Bogus field? */ + WARN_ON(!ftrp->width); +} + +static int update_32bit_cpu_features(int cpu, struct cpuinfo_arm64 *info, + struct cpuinfo_arm64 *boot) +{ + int taint = 0; + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + /* + * If we don't have AArch32 at all then skip the checks entirely + * as the register values may be UNKNOWN and we're not going to be + * using them for anything. + */ + if (!id_aa64pfr0_32bit_el0(pfr0)) + return taint; + + /* + * If we don't have AArch32 at EL1, then relax the strictness of + * EL1-dependent register fields to avoid spurious sanity check fails. 
+ */ + if (!id_aa64pfr0_32bit_el1(pfr0)) { + relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_SMC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRT_FRAC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SEC_FRAC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRTUALIZATION_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SECURITY_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_PROGMOD_SHIFT); + } + + taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, + info->reg_id_dfr0, boot->reg_id_dfr0); + taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu, + info->reg_id_dfr1, boot->reg_id_dfr1); + taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, + info->reg_id_isar0, boot->reg_id_isar0); + taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, + info->reg_id_isar1, boot->reg_id_isar1); + taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, + info->reg_id_isar2, boot->reg_id_isar2); + taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, + info->reg_id_isar3, boot->reg_id_isar3); + taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, + info->reg_id_isar4, boot->reg_id_isar4); + taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, + info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, + info->reg_id_isar6, boot->reg_id_isar6); + + /* + * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and + * ACTLR formats could differ across CPUs and therefore would have to + * be trapped for virtualization anyway. + */ + taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, + info->reg_id_mmfr0, boot->reg_id_mmfr0); + taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, + info->reg_id_mmfr1, boot->reg_id_mmfr1); + taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, + info->reg_id_mmfr2, boot->reg_id_mmfr2); + taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, + info->reg_id_mmfr3, boot->reg_id_mmfr3); + taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu, + info->reg_id_mmfr4, boot->reg_id_mmfr4); + taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu, + info->reg_id_mmfr5, boot->reg_id_mmfr5); + taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, + info->reg_id_pfr0, boot->reg_id_pfr0); + taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, + info->reg_id_pfr1, boot->reg_id_pfr1); + taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu, + info->reg_id_pfr2, boot->reg_id_pfr2); + taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, + info->reg_mvfr0, boot->reg_mvfr0); + taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, + info->reg_mvfr1, boot->reg_mvfr1); + taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, + info->reg_mvfr2, boot->reg_mvfr2); + + return taint; +} + /* * Update system wide CPU feature registers with the values from a * non-boot CPU. Also performs SANITY checks to make sure that there @@ -753,9 +1026,6 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); - /* - * EL3 is not our concern. - */ taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, @@ -764,55 +1034,6 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); - /* - * If we have AArch32, we care about 32-bit features for compat. - * If the system doesn't support AArch32, don't update them. 
- */ - if (id_aa64pfr0_32bit_el0(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && - id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - - taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, - info->reg_id_dfr0, boot->reg_id_dfr0); - taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, - info->reg_id_isar0, boot->reg_id_isar0); - taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, - info->reg_id_isar1, boot->reg_id_isar1); - taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, - info->reg_id_isar2, boot->reg_id_isar2); - taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, - info->reg_id_isar3, boot->reg_id_isar3); - taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, - info->reg_id_isar4, boot->reg_id_isar4); - taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, - info->reg_id_isar5, boot->reg_id_isar5); - taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, - info->reg_id_isar6, boot->reg_id_isar6); - - /* - * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and - * ACTLR formats could differ across CPUs and therefore would have to - * be trapped for virtualization anyway. - */ - taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, - info->reg_id_mmfr0, boot->reg_id_mmfr0); - taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, - info->reg_id_mmfr1, boot->reg_id_mmfr1); - taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, - info->reg_id_mmfr2, boot->reg_id_mmfr2); - taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, - info->reg_id_mmfr3, boot->reg_id_mmfr3); - taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, - info->reg_id_pfr0, boot->reg_id_pfr0); - taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, - info->reg_id_pfr1, boot->reg_id_pfr1); - taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, - info->reg_mvfr0, boot->reg_mvfr0); - taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, - info->reg_mvfr1, boot->reg_mvfr1); - taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, - info->reg_mvfr2, boot->reg_mvfr2); - } - if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, info->reg_zcr, boot->reg_zcr); @@ -824,6 +1045,12 @@ void update_cpu_features(int cpu, } /* + * This relies on a sanitised view of the AArch64 ID registers + * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. + */ + taint |= update_32bit_cpu_features(cpu, info, boot); + + /* * Mismatched CPU features are a recipe for disaster. Don't even * pretend to support them. 
*/ @@ -837,8 +1064,8 @@ u64 read_sanitised_ftr_reg(u32 id) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id); - /* We shouldn't get a request for an unsupported register */ - BUG_ON(!regp); + if (!regp) + return 0; return regp->sys_val; } @@ -854,11 +1081,15 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) switch (sys_id) { read_sysreg_case(SYS_ID_PFR0_EL1); read_sysreg_case(SYS_ID_PFR1_EL1); + read_sysreg_case(SYS_ID_PFR2_EL1); read_sysreg_case(SYS_ID_DFR0_EL1); + read_sysreg_case(SYS_ID_DFR1_EL1); read_sysreg_case(SYS_ID_MMFR0_EL1); read_sysreg_case(SYS_ID_MMFR1_EL1); read_sysreg_case(SYS_ID_MMFR2_EL1); read_sysreg_case(SYS_ID_MMFR3_EL1); + read_sysreg_case(SYS_ID_MMFR4_EL1); + read_sysreg_case(SYS_ID_MMFR5_EL1); read_sysreg_case(SYS_ID_ISAR0_EL1); read_sysreg_case(SYS_ID_ISAR1_EL1); read_sysreg_case(SYS_ID_ISAR2_EL1); @@ -1409,6 +1640,21 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, } #endif +#ifdef CONFIG_ARM64_BTI +static void bti_enable(const struct arm64_cpu_capabilities *__unused) +{ + /* + * Use of X16/X17 for tail-calls and trampolines that jump to + * function entry points using BR is a requirement for + * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI. + * So, be strict and forbid other BRs using other registers to + * jump onto a PACIxSP instruction: + */ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1); + isb(); +} +#endif /* CONFIG_ARM64_BTI */ + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -1511,6 +1757,18 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR0_EL0_SHIFT, .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, }, +#ifdef CONFIG_KVM + { + .desc = "32-bit EL1 Support", + .capability = ARM64_HAS_32BIT_EL1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL1_SHIFT, + .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, + }, +#endif { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, @@ -1779,6 +2037,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 1, }, #endif +#ifdef CONFIG_ARM64_BTI + { + .desc = "Branch Target Identification", + .capability = ARM64_BTI, +#ifdef CONFIG_ARM64_BTI_KERNEL + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, +#else + .type = ARM64_CPUCAP_SYSTEM_FEATURE, +#endif + .matches = has_cpuid_feature, + .cpu_enable = bti_enable, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_BT_SHIFT, + .min_field_value = ID_AA64PFR1_BT_BTI, + .sign = FTR_UNSIGNED, + }, +#endif {}, }; @@ -1888,6 +2163,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F64MM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_F64MM, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), #endif HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS), +#ifdef CONFIG_ARM64_BTI + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_BT_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_BT_BTI, CAP_HWCAP, KERNEL_HWCAP_BTI), +#endif #ifdef CONFIG_ARM64_PTR_AUTH HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), @@ -2181,6 +2459,36 @@ static void verify_sve_features(void) /* Add checks on other ZCR bits here if necessary */ } +static void 
verify_hyp_capabilities(void) +{ + u64 safe_mmfr1, mmfr0, mmfr1; + int parange, ipa_max; + unsigned int safe_vmid_bits, vmid_bits; + + if (!IS_ENABLED(CONFIG_KVM) || !IS_ENABLED(CONFIG_KVM_ARM_HOST)) + return; + + safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); + + /* Verify VMID bits */ + safe_vmid_bits = get_vmid_bits(safe_mmfr1); + vmid_bits = get_vmid_bits(mmfr1); + if (vmid_bits < safe_vmid_bits) { + pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + /* Verify IPA range */ + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_PARANGE_SHIFT); + ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + if (ipa_max < get_kvm_ipa_limit()) { + pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id()); + cpu_die_early(); + } +} /* * Run through the enabled system capabilities and enable() it on this CPU. @@ -2206,6 +2514,9 @@ static void verify_local_cpu_capabilities(void) if (system_supports_sve()) verify_sve_features(); + + if (is_hyp_mode_available()) + verify_hyp_capabilities(); } void check_local_cpu_capabilities(void) @@ -2394,7 +2705,7 @@ static int emulate_sys_reg(u32 id, u64 *valp) if (sys_reg_CRm(id) == 0) return emulate_id_reg(id, valp); - regp = get_arm64_ftr_reg(id); + regp = get_arm64_ftr_reg_nowarn(id); if (regp) *valp = arm64_ftr_reg_user_value(regp); else diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 86136075ae41..86637466daa8 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -92,6 +92,7 @@ static const char *const hwcap_str[] = { "bf16", "dgh", "rng", + "bti", NULL }; @@ -311,6 +312,8 @@ static int __init cpuinfo_regs_init(void) } return 0; } +device_initcall(cpuinfo_regs_init); + static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) { unsigned int cpu = smp_processor_id(); @@ -362,6 +365,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) /* Update the 32bit ID registers only if AArch32 is implemented */ if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1); info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); @@ -373,8 +377,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1); + info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1); info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1); info->reg_mvfr0 = read_cpuid(MVFR0_EL1); info->reg_mvfr1 = read_cpuid(MVFR1_EL1); @@ -403,5 +410,3 @@ void __init cpuinfo_store_boot_cpu(void) boot_cpu_data = *info; init_cpu_features(&boot_cpu_data); } - -device_initcall(cpuinfo_regs_init); diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index ca4c3e12d8c5..1f646b07e3e9 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -5,6 +5,7 @@ */ #include <linux/crash_core.h> +#include <asm/cpufeature.h> #include <asm/memory.h> void arch_crash_save_vmcoreinfo(void) @@ -16,4 +17,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", PHYS_OFFSET); 
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n", + system_supports_address_auth() ? + ptrauth_kernel_pac_mask() : 0); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 48222a4760c2..15e80c876d46 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -376,15 +376,13 @@ int aarch32_break_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(aarch32_break_handler); -static int __init debug_traps_init(void) +void __init debug_traps_init(void) { hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, TRAP_TRACE, "single-step handler"); hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, TRAP_BRKPT, "ptrace BRK handler"); - return 0; } -arch_initcall(debug_traps_init); /* Re-enable single step for syscall restarting. */ void user_rewind_single_step(struct task_struct *task) diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 1a03618df0df..0073b24b5d25 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -14,12 +14,12 @@ SYM_CODE_START(efi_enter_kernel) /* - * efi_entry() will have copied the kernel image if necessary and we + * efi_pe_entry() will have copied the kernel image if necessary and we * end up here with device tree address in x1 and the kernel entry * point stored in x0. Save those values in registers which are * callee preserved. */ - ldr w2, =stext_offset + ldr w2, =primary_entry_offset add x19, x0, x2 // relocated Image entrypoint mov x20, x1 // DTB address diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 914999ccaf8a..df67c0f2a077 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -27,12 +27,12 @@ optional_header: .long __initdata_begin - efi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_efi_entry - _head // AddressOfEntryPoint + .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint .long efi_header_end - _head // BaseOfCode extra_header_fields: .quad 0 // ImageBase - .long SZ_4K // SectionAlignment + .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment .short 0 // MajorOperatingSystemVersion .short 0 // MinorOperatingSystemVersion diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S index 3fc71106cb2b..75691a2641c1 100644 --- a/arch/arm64/kernel/efi-rt-wrapper.S +++ b/arch/arm64/kernel/efi-rt-wrapper.S @@ -5,7 +5,7 @@ #include <linux/linkage.h> -ENTRY(__efi_rt_asm_wrapper) +SYM_FUNC_START(__efi_rt_asm_wrapper) stp x29, x30, [sp, #-32]! mov x29, sp @@ -34,5 +34,14 @@ ENTRY(__efi_rt_asm_wrapper) ldp x29, x30, [sp], #32 b.ne 0f ret -0: b efi_handle_corrupted_x18 // tail call -ENDPROC(__efi_rt_asm_wrapper) +0: + /* + * With CONFIG_SHADOW_CALL_STACK, the kernel uses x18 to store a + * shadow stack pointer, which we need to restore before returning to + * potentially instrumented code. This is safe because the wrapper is + * called with preemption disabled and a separate shadow stack is used + * for interrupts. 
+ */ + mov x18, x2 + b efi_handle_corrupted_x18 // tail call +SYM_FUNC_END(__efi_rt_asm_wrapper) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index c839b5bf1904..3dbdf9752b11 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -94,7 +94,7 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) break; default: el1_inv(regs, esr); - }; + } } NOKPROBE_SYMBOL(el1_sync_handler); @@ -188,6 +188,14 @@ static void notrace el0_undef(struct pt_regs *regs) } NOKPROBE_SYMBOL(el0_undef); +static void notrace el0_bti(struct pt_regs *regs) +{ + user_exit_irqoff(); + local_daif_restore(DAIF_PROCCTX); + do_bti(regs); +} +NOKPROBE_SYMBOL(el0_bti); + static void notrace el0_inv(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); @@ -255,6 +263,9 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_UNKNOWN: el0_undef(regs); break; + case ESR_ELx_EC_BTI: + el0_bti(regs); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 0f24eae8f3cc..f880dd63ddc3 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -16,34 +16,34 @@ * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_save_state) +SYM_FUNC_START(fpsimd_save_state) fpsimd_save x0, 8 ret -ENDPROC(fpsimd_save_state) +SYM_FUNC_END(fpsimd_save_state) /* * Load the FP registers. * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_load_state) +SYM_FUNC_START(fpsimd_load_state) fpsimd_restore x0, 8 ret -ENDPROC(fpsimd_load_state) +SYM_FUNC_END(fpsimd_load_state) #ifdef CONFIG_ARM64_SVE -ENTRY(sve_save_state) +SYM_FUNC_START(sve_save_state) sve_save 0, x1, 2 ret -ENDPROC(sve_save_state) +SYM_FUNC_END(sve_save_state) -ENTRY(sve_load_state) +SYM_FUNC_START(sve_load_state) sve_load 0, x1, x2, 3, x4 ret -ENDPROC(sve_load_state) +SYM_FUNC_END(sve_load_state) -ENTRY(sve_get_vl) +SYM_FUNC_START(sve_get_vl) _sve_rdvl 0, 1 ret -ENDPROC(sve_get_vl) +SYM_FUNC_END(sve_get_vl) #endif /* CONFIG_ARM64_SVE */ diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 833d48c9acb5..a338f40e64d3 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -23,8 +23,9 @@ * * ... where <entry> is either ftrace_caller or ftrace_regs_caller. * - * Each instrumented function follows the AAPCS, so here x0-x8 and x19-x30 are - * live, and x9-x18 are safe to clobber. + * Each instrumented function follows the AAPCS, so here x0-x8 and x18-x30 are + * live (x18 holds the Shadow Call Stack pointer), and x9-x17 are safe to + * clobber. * * We save the callsite's context into a pt_regs before invoking any ftrace * callbacks. 
So that we can get a sensible backtrace, we create a stack record diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ddcde093c433..5304d193c79d 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -23,6 +23,7 @@ #include <asm/mmu.h> #include <asm/processor.h> #include <asm/ptrace.h> +#include <asm/scs.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> @@ -178,7 +179,9 @@ alternative_cb_end apply_ssbd 1, x22, x23 - ptrauth_keys_install_kernel tsk, 1, x20, x22, x23 + ptrauth_keys_install_kernel tsk, x20, x22, x23 + + scs_load tsk, x20 .else add x21, sp, #S_FRAME_SIZE get_current_task tsk @@ -343,6 +346,8 @@ alternative_else_nop_endif msr cntkctl_el1, x1 4: #endif + scs_save tsk, x0 + /* No kernel C function calls after this as user keys are set. */ ptrauth_keys_install_user tsk, x0, x1, x2 @@ -388,6 +393,9 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 .macro irq_stack_entry mov x19, sp // preserve the original sp +#ifdef CONFIG_SHADOW_CALL_STACK + mov x24, scs_sp // preserve the original shadow stack +#endif /* * Compare sp with the base of the task stack. @@ -405,15 +413,25 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 /* switch to the irq stack */ mov sp, x26 + +#ifdef CONFIG_SHADOW_CALL_STACK + /* also switch to the irq shadow stack */ + adr_this_cpu scs_sp, irq_shadow_call_stack, x26 +#endif + 9998: .endm /* - * x19 should be preserved between irq_stack_entry and - * irq_stack_exit. + * The callee-saved regs (x19-x29) should be preserved between + * irq_stack_entry and irq_stack_exit, but note that kernel_entry + * uses x20-x23 to store data for later use. */ .macro irq_stack_exit mov sp, x19 +#ifdef CONFIG_SHADOW_CALL_STACK + mov scs_sp, x24 +#endif .endm /* GPRs used by entry code */ @@ -728,20 +746,9 @@ el0_error_naked: SYM_CODE_END(el0_error) /* - * Ok, we need to do extra processing, enter the slow path. - */ -work_pending: - mov x0, sp // 'regs' - bl do_notify_resume -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on // enabled while in userspace -#endif - ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step - b finish_ret_to_user -/* * "slow" syscall return path. */ -ret_to_user: +SYM_CODE_START_LOCAL(ret_to_user) disable_daif gic_prio_kentry_setup tmp=x3 ldr x1, [tsk, #TSK_TI_FLAGS] @@ -753,7 +760,19 @@ finish_ret_to_user: bl stackleak_erase #endif kernel_exit 0 -ENDPROC(ret_to_user) + +/* + * Ok, we need to do extra processing, enter the slow path. + */ +work_pending: + mov x0, sp // 'regs' + bl do_notify_resume +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_on // enabled while in userspace +#endif + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step + b finish_ret_to_user +SYM_CODE_END(ret_to_user) .popsection // .entry.text @@ -900,7 +919,9 @@ SYM_FUNC_START(cpu_switch_to) ldr lr, [x8] mov sp, x9 msr sp_el0, x1 - ptrauth_keys_install_kernel x1, 1, x8, x9, x10 + ptrauth_keys_install_kernel x1, x8, x9, x10 + scs_save x0, x8 + scs_load x1, x8 ret SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) @@ -1029,13 +1050,16 @@ SYM_CODE_START(__sdei_asm_handler) mov x19, x1 +#if defined(CONFIG_VMAP_STACK) || defined(CONFIG_SHADOW_CALL_STACK) + ldrb w4, [x19, #SDEI_EVENT_PRIORITY] +#endif + #ifdef CONFIG_VMAP_STACK /* * entry.S may have been using sp as a scratch register, find whether * this is a normal or critical event and switch to the appropriate * stack for this CPU. 
*/ - ldrb w4, [x19, #SDEI_EVENT_PRIORITY] cbnz w4, 1f ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 b 2f @@ -1045,6 +1069,15 @@ SYM_CODE_START(__sdei_asm_handler) mov sp, x5 #endif +#ifdef CONFIG_SHADOW_CALL_STACK + /* Use a separate shadow call stack for normal and critical events */ + cbnz w4, 3f + adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal, tmp=x6 + b 4f +3: adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical, tmp=x6 +4: +#endif + /* * We may have interrupted userspace, or a guest, or exit-from or * return-to either of these. We can't trust sp_el0, restore it. diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 94289d126993..35cb5e66c504 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl) #ifdef CONFIG_SYSCTL static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int vl = sve_default_vl; diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 57a91032b4c2..632702146813 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/irqchip/arm-gic-v3.h> +#include <asm/asm_pointer_auth.h> #include <asm/assembler.h> #include <asm/boot.h> #include <asm/ptrace.h> @@ -27,6 +28,7 @@ #include <asm/pgtable-hwdef.h> #include <asm/pgtable.h> #include <asm/page.h> +#include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> #include <asm/thread_info.h> @@ -70,9 +72,9 @@ _head: * its opcode forms the magic "MZ" signature required by UEFI. */ add x13, x18, #0x16 - b stext + b primary_entry #else - b stext // branch to kernel start, magic + b primary_entry // branch to kernel start, magic .long 0 // reserved #endif le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian @@ -98,14 +100,13 @@ pe_header: * primary lowlevel boot path: * * Register Scope Purpose - * x21 stext() .. start_kernel() FDT pointer passed at boot in x0 - * x23 stext() .. start_kernel() physical misalignment/KASLR offset - * x28 __create_page_tables() callee preserved temp register - * x19/x20 __primary_switch() callee preserved temp registers - * x24 __primary_switch() .. relocate_kernel() - * current RELR displacement + * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 + * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset + * x28 __create_page_tables() callee preserved temp register + * x19/x20 __primary_switch() callee preserved temp registers + * x24 __primary_switch() .. relocate_kernel() current RELR displacement */ -SYM_CODE_START(stext) +SYM_CODE_START(primary_entry) bl preserve_boot_args bl el2_setup // Drop to EL1, w0=cpu_boot_mode adrp x23, __PHYS_OFFSET @@ -118,10 +119,9 @@ SYM_CODE_START(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - mov x0, #ARM64_CPU_BOOT_PRIMARY bl __cpu_setup // initialise processor b __primary_switch -SYM_CODE_END(stext) +SYM_CODE_END(primary_entry) /* * Preserve the arguments passed by the bootloader in x0 .. x3 @@ -394,13 +394,19 @@ SYM_FUNC_START_LOCAL(__create_page_tables) /* * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate the idmap and swapper page - * tables again to remove any speculatively loaded cache lines. 
+ * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. */ + dmb sy + adrp x0, idmap_pg_dir + adrp x1, idmap_pg_end + sub x1, x1, x0 + bl __inval_dcache_area + + adrp x0, init_pg_dir adrp x1, init_pg_end sub x1, x1, x0 - dmb sy bl __inval_dcache_area ret x28 @@ -417,6 +423,10 @@ SYM_FUNC_START_LOCAL(__primary_switched) adr_l x5, init_task msr sp_el0, x5 // Save thread_info +#ifdef CONFIG_ARM64_PTR_AUTH + __ptrauth_keys_init_cpu x5, x6, x7, x8 +#endif + adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -424,6 +434,10 @@ SYM_FUNC_START_LOCAL(__primary_switched) stp xzr, x30, [sp, #-16]! mov x29, sp +#ifdef CONFIG_SHADOW_CALL_STACK + adr_l scs_sp, init_shadow_call_stack // Set shadow call stack +#endif + str_l x21, __fdt_pointer, x5 // Save FDT pointer ldr_l x4, kimage_vaddr // Save the offset between @@ -717,7 +731,6 @@ SYM_FUNC_START_LOCAL(secondary_startup) * Common entry point for secondary CPUs. */ bl __cpu_secondary_check52bitva - mov x0, #ARM64_CPU_BOOT_SECONDARY bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir bl __enable_mmu @@ -737,8 +750,14 @@ SYM_FUNC_START_LOCAL(__secondary_switched) ldr x2, [x0, #CPU_BOOT_TASK] cbz x2, __secondary_too_slow msr sp_el0, x2 + scs_load x2, x3 mov x29, #0 mov x30, #0 + +#ifdef CONFIG_ARM64_PTR_AUTH + ptrauth_keys_init_cpu x2, x3, x4, x5 +#endif + b secondary_start_kernel SYM_FUNC_END(__secondary_switched) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 6532105b3e32..8ccca660034e 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -65,7 +65,7 @@ * x5: physical address of a zero page that remains zero after resume */ .pushsection ".hibernate_exit.text", "ax" -ENTRY(swsusp_arch_suspend_exit) +SYM_CODE_START(swsusp_arch_suspend_exit) /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page @@ -110,7 +110,7 @@ ENTRY(swsusp_arch_suspend_exit) cbz x24, 3f /* Do we need to re-initialise EL2? */ hvc #0 3: ret -ENDPROC(swsusp_arch_suspend_exit) +SYM_CODE_END(swsusp_arch_suspend_exit) /* * Restore the hyp stub. @@ -119,15 +119,15 @@ ENDPROC(swsusp_arch_suspend_exit) * * x24: The physical address of __hyp_stub_vectors */ -el1_sync: +SYM_CODE_START_LOCAL(el1_sync) msr vbar_el2, x24 eret -ENDPROC(el1_sync) +SYM_CODE_END(el1_sync) .macro invalid_vector label -\label: +SYM_CODE_START_LOCAL(\label) b \label -ENDPROC(\label) +SYM_CODE_END(\label) .endm invalid_vector el2_sync_invalid @@ -141,7 +141,7 @@ ENDPROC(\label) /* el2 vectors - switch el2 here while we restore the memory image. 
*/ .align 11 -ENTRY(hibernate_el2_vectors) +SYM_CODE_START(hibernate_el2_vectors) ventry el2_sync_invalid // Synchronous EL2t ventry el2_irq_invalid // IRQ EL2t ventry el2_fiq_invalid // FIQ EL2t @@ -161,6 +161,6 @@ ENTRY(hibernate_el2_vectors) ventry el1_irq_invalid // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 ventry el1_error_invalid // Error 32-bit EL1 -END(hibernate_el2_vectors) +SYM_CODE_END(hibernate_el2_vectors) .popsection diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index e473ead806ed..160f5881a0b7 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -21,7 +21,7 @@ .align 11 -ENTRY(__hyp_stub_vectors) +SYM_CODE_START(__hyp_stub_vectors) ventry el2_sync_invalid // Synchronous EL2t ventry el2_irq_invalid // IRQ EL2t ventry el2_fiq_invalid // FIQ EL2t @@ -41,11 +41,11 @@ ENTRY(__hyp_stub_vectors) ventry el1_irq_invalid // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 ventry el1_error_invalid // Error 32-bit EL1 -ENDPROC(__hyp_stub_vectors) +SYM_CODE_END(__hyp_stub_vectors) .align 11 -el1_sync: +SYM_CODE_START_LOCAL(el1_sync) cmp x0, #HVC_SET_VECTORS b.ne 2f msr vbar_el2, x1 @@ -68,12 +68,12 @@ el1_sync: 9: mov x0, xzr eret -ENDPROC(el1_sync) +SYM_CODE_END(el1_sync) .macro invalid_vector label -\label: +SYM_CODE_START_LOCAL(\label) b \label -ENDPROC(\label) +SYM_CODE_END(\label) .endm invalid_vector el2_sync_invalid @@ -106,15 +106,15 @@ ENDPROC(\label) * initialisation entry point. */ -ENTRY(__hyp_set_vectors) +SYM_FUNC_START(__hyp_set_vectors) mov x1, x0 mov x0, #HVC_SET_VECTORS hvc #0 ret -ENDPROC(__hyp_set_vectors) +SYM_FUNC_END(__hyp_set_vectors) -ENTRY(__hyp_reset_vectors) +SYM_FUNC_START(__hyp_reset_vectors) mov x0, #HVC_RESET_VECTORS hvc #0 ret -ENDPROC(__hyp_reset_vectors) +SYM_FUNC_END(__hyp_reset_vectors) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 7f06ad93fc95..be0a63ffed23 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -13,7 +13,7 @@ #ifdef CONFIG_EFI __efistub_kernel_size = _edata - _text; -__efistub_stext_offset = stext - _text; +__efistub_primary_entry_offset = primary_entry - _text; /* diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 4a9e773a177f..684d871ae38d 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -51,21 +51,33 @@ enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; } -/* NOP is an alias of HINT */ -bool __kprobes aarch64_insn_is_nop(u32 insn) +bool __kprobes aarch64_insn_is_steppable_hint(u32 insn) { if (!aarch64_insn_is_hint(insn)) return false; switch (insn & 0xFE0) { - case AARCH64_INSN_HINT_YIELD: - case AARCH64_INSN_HINT_WFE: - case AARCH64_INSN_HINT_WFI: - case AARCH64_INSN_HINT_SEV: - case AARCH64_INSN_HINT_SEVL: - return false; - default: + case AARCH64_INSN_HINT_XPACLRI: + case AARCH64_INSN_HINT_PACIA_1716: + case AARCH64_INSN_HINT_PACIB_1716: + case AARCH64_INSN_HINT_AUTIA_1716: + case AARCH64_INSN_HINT_AUTIB_1716: + case AARCH64_INSN_HINT_PACIAZ: + case AARCH64_INSN_HINT_PACIASP: + case AARCH64_INSN_HINT_PACIBZ: + case AARCH64_INSN_HINT_PACIBSP: + case AARCH64_INSN_HINT_AUTIAZ: + case AARCH64_INSN_HINT_AUTIASP: + case AARCH64_INSN_HINT_AUTIBZ: + case AARCH64_INSN_HINT_AUTIBSP: + case AARCH64_INSN_HINT_BTI: + case AARCH64_INSN_HINT_BTIC: + case AARCH64_INSN_HINT_BTIJ: + case AARCH64_INSN_HINT_BTIJC: + case AARCH64_INSN_HINT_NOP: return true; + default: + return false; } } 
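As an aside, the switch in aarch64_insn_is_steppable_hint() above keys on insn & 0xFE0, i.e. the HINT instruction's CRm:op2 immediate left in place at bits [11:5]. A minimal illustrative sketch of recovering the raw hint number (the helper name below is made up, not part of the patch):

static inline u32 aarch64_insn_hint_imm(u32 insn)
{
	/* HINT #imm encodes imm in CRm:op2, bits [11:5] of the opcode */
	return (insn >> 5) & 0x7f;	/* e.g. 0 for NOP, 32/34/36/38 for the BTI variants */
}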
@@ -574,7 +586,7 @@ u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, offset >> 2); } -u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_op op) +u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) { return aarch64_insn_get_hint_value() | op; } @@ -1535,16 +1547,10 @@ static u32 aarch64_encode_immediate(u64 imm, u32 insn) { unsigned int immr, imms, n, ones, ror, esz, tmp; - u64 mask = ~0UL; - - /* Can't encode full zeroes or full ones */ - if (!imm || !~imm) - return AARCH64_BREAK_FAULT; + u64 mask; switch (variant) { case AARCH64_INSN_VARIANT_32BIT: - if (upper_32_bits(imm)) - return AARCH64_BREAK_FAULT; esz = 32; break; case AARCH64_INSN_VARIANT_64BIT: @@ -1556,6 +1562,12 @@ static u32 aarch64_encode_immediate(u64 imm, return AARCH64_BREAK_FAULT; } + mask = GENMASK(esz - 1, 0); + + /* Can't encode full zeroes, full ones, or value wider than the mask */ + if (!imm || imm == mask || imm & ~mask) + return AARCH64_BREAK_FAULT; + /* * Inverse of Replicate(). Try to spot a repeating pattern * with a pow2 stride. diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 8e9c924423b4..a0b144cfaea7 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -177,6 +177,7 @@ void machine_kexec(struct kimage *kimage) * the offline CPUs. Therefore, we must use the __* variant here. */ __flush_icache_range((uintptr_t)reboot_code_buffer, + (uintptr_t)reboot_code_buffer + arm64_relocate_new_kernel_size); /* Flush the kimage list and its buffers. */ diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index b40c3b0def92..522e6f517ec0 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -138,12 +138,12 @@ static int setup_dtb(struct kimage *image, /* add rng-seed */ if (rng_is_initialized()) { - u8 rng_seed[RNG_SEED_SIZE]; - get_random_bytes(rng_seed, RNG_SEED_SIZE); - ret = fdt_setprop(dtb, off, FDT_PROP_RNG_SEED, rng_seed, - RNG_SEED_SIZE); + void *rng_seed; + ret = fdt_setprop_placeholder(dtb, off, FDT_PROP_RNG_SEED, + RNG_SEED_SIZE, &rng_seed); if (ret) goto out; + get_random_bytes(rng_seed, RNG_SEED_SIZE); } else { pr_notice("RNG is not initialised: omitting \"%s\" property\n", FDT_PROP_RNG_SEED); @@ -284,7 +284,7 @@ int load_other_segments(struct kimage *image, image->arch.elf_headers_sz = headers_sz; pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_headers_mem, headers_sz, headers_sz); + image->arch.elf_headers_mem, kbuf.bufsz, kbuf.memsz); } /* load initrd */ @@ -305,7 +305,7 @@ int load_other_segments(struct kimage *image, initrd_load_addr = kbuf.mem; pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - initrd_load_addr, initrd_len, initrd_len); + initrd_load_addr, kbuf.bufsz, kbuf.memsz); } /* load dtb */ @@ -332,7 +332,7 @@ int load_other_segments(struct kimage *image, image->arch.dtb_mem = kbuf.mem; pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kbuf.mem, dtb_len, dtb_len); + kbuf.mem, kbuf.bufsz, kbuf.memsz); return 0; diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 1ef702b0be2d..295d66490584 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -120,7 +120,7 @@ static bool has_pv_steal_clock(void) struct arm_smccc_res res; /* To detect the presence of PV time support we require SMCCC 1.1+ */ - if (psci_ops.smccc_version < SMCCC_VERSION_1_1) + if (arm_smccc_1_1_get_conduit() == 
SMCCC_CONDUIT_NONE) return false; arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index b78fac9e546c..263d5fba4c8a 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -46,7 +46,7 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * except for the NOP case. */ if (aarch64_insn_is_hint(insn)) - return aarch64_insn_is_nop(insn); + return aarch64_insn_is_steppable_hint(insn); return true; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 45dce03aaeaf..890ca72c5a51 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -61,7 +61,7 @@ ldp x28, x29, [sp, #S_X28] .endm -ENTRY(kretprobe_trampoline) +SYM_CODE_START(kretprobe_trampoline) sub sp, sp, #S_FRAME_SIZE save_all_base_regs @@ -79,4 +79,4 @@ ENTRY(kretprobe_trampoline) add sp, sp, #S_FRAME_SIZE ret -ENDPROC(kretprobe_trampoline) +SYM_CODE_END(kretprobe_trampoline) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 56be4cbf771f..eade7807e819 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -11,6 +11,7 @@ #include <linux/compat.h> #include <linux/efi.h> +#include <linux/elf.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> @@ -18,6 +19,7 @@ #include <linux/sched/task_stack.h> #include <linux/kernel.h> #include <linux/lockdep.h> +#include <linux/mman.h> #include <linux/mm.h> #include <linux/stddef.h> #include <linux/sysctl.h> @@ -209,6 +211,15 @@ void machine_restart(char *cmd) while (1); } +#define bstr(suffix, str) [PSR_BTYPE_ ## suffix >> PSR_BTYPE_SHIFT] = str +static const char *const btypes[] = { + bstr(NONE, "--"), + bstr( JC, "jc"), + bstr( C, "-c"), + bstr( J , "j-") +}; +#undef bstr + static void print_pstate(struct pt_regs *regs) { u64 pstate = regs->pstate; @@ -227,7 +238,10 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_AA32_I_BIT ? 'I' : 'i', pstate & PSR_AA32_F_BIT ? 'F' : 'f'); } else { - printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO)\n", + const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >> + PSR_BTYPE_SHIFT]; + + printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO BTYPE=%s)\n", pstate, pstate & PSR_N_BIT ? 'N' : 'n', pstate & PSR_Z_BIT ? 'Z' : 'z', @@ -238,7 +252,8 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_I_BIT ? 'I' : 'i', pstate & PSR_F_BIT ? 'F' : 'f', pstate & PSR_PAN_BIT ? '+' : '-', - pstate & PSR_UAO_BIT ? '+' : '-'); + pstate & PSR_UAO_BIT ? '+' : '-', + btype_str); } } @@ -655,3 +670,25 @@ asmlinkage void __sched arm64_preempt_schedule_irq(void) if (system_capabilities_finalized()) preempt_schedule_irq(); } + +#ifdef CONFIG_BINFMT_ELF +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp) +{ + /* + * For dynamically linked executables the interpreter is + * responsible for setting PROT_BTI on everything except + * itself. 
+ */ + if (is_interp != has_interp) + return prot; + + if (!(state->flags & ARM64_ELF_BTI)) + return prot; + + if (prot & PROT_EXEC) + prot |= PROT_BTI; + + return prot; +} +#endif diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b3d3005d9515..76790a5f2a0d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1829,10 +1829,11 @@ static void tracehook_report_syscall(struct pt_regs *regs, int syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE) || - test_thread_flag(TIF_SYSCALL_EMU)) { + unsigned long flags = READ_ONCE(current_thread_info()->flags); + + if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); - if (!in_syscall(regs) || test_thread_flag(TIF_SYSCALL_EMU)) + if (!in_syscall(regs) || (flags & _TIF_SYSCALL_EMU)) return -1; } @@ -1874,7 +1875,7 @@ void syscall_trace_exit(struct pt_regs *regs) */ #define SPSR_EL1_AARCH64_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ - GENMASK_ULL(20, 13) | GENMASK_ULL(11, 10) | GENMASK_ULL(5, 5)) + GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) diff --git a/arch/arm64/kernel/reloc_test_syms.S b/arch/arm64/kernel/reloc_test_syms.S index 16a34f188f26..c50f45fa29fa 100644 --- a/arch/arm64/kernel/reloc_test_syms.S +++ b/arch/arm64/kernel/reloc_test_syms.S @@ -5,81 +5,81 @@ #include <linux/linkage.h> -ENTRY(absolute_data64) +SYM_FUNC_START(absolute_data64) ldr x0, 0f ret 0: .quad sym64_abs -ENDPROC(absolute_data64) +SYM_FUNC_END(absolute_data64) -ENTRY(absolute_data32) +SYM_FUNC_START(absolute_data32) ldr w0, 0f ret 0: .long sym32_abs -ENDPROC(absolute_data32) +SYM_FUNC_END(absolute_data32) -ENTRY(absolute_data16) +SYM_FUNC_START(absolute_data16) adr x0, 0f ldrh w0, [x0] ret 0: .short sym16_abs, 0 -ENDPROC(absolute_data16) +SYM_FUNC_END(absolute_data16) -ENTRY(signed_movw) +SYM_FUNC_START(signed_movw) movz x0, #:abs_g2_s:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(signed_movw) +SYM_FUNC_END(signed_movw) -ENTRY(unsigned_movw) +SYM_FUNC_START(unsigned_movw) movz x0, #:abs_g3:sym64_abs movk x0, #:abs_g2_nc:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(unsigned_movw) +SYM_FUNC_END(unsigned_movw) .align 12 .space 0xff8 -ENTRY(relative_adrp) +SYM_FUNC_START(relative_adrp) adrp x0, sym64_rel add x0, x0, #:lo12:sym64_rel ret -ENDPROC(relative_adrp) +SYM_FUNC_END(relative_adrp) .align 12 .space 0xffc -ENTRY(relative_adrp_far) +SYM_FUNC_START(relative_adrp_far) adrp x0, memstart_addr add x0, x0, #:lo12:memstart_addr ret -ENDPROC(relative_adrp_far) +SYM_FUNC_END(relative_adrp_far) -ENTRY(relative_adr) +SYM_FUNC_START(relative_adr) adr x0, sym64_rel ret -ENDPROC(relative_adr) +SYM_FUNC_END(relative_adr) -ENTRY(relative_data64) +SYM_FUNC_START(relative_data64) adr x1, 0f ldr x0, [x1] add x0, x0, x1 ret 0: .quad sym64_rel - . -ENDPROC(relative_data64) +SYM_FUNC_END(relative_data64) -ENTRY(relative_data32) +SYM_FUNC_START(relative_data32) adr x1, 0f ldr w0, [x1] add x0, x0, x1 ret 0: .long sym64_rel - . 
-ENDPROC(relative_data32) +SYM_FUNC_END(relative_data32) -ENTRY(relative_data16) +SYM_FUNC_START(relative_data16) adr x1, 0f ldrsh w0, [x1] add x0, x0, x1 ret 0: .short sym64_rel - ., 0 -ENDPROC(relative_data16) +SYM_FUNC_END(relative_data16) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index c40ce496c78b..542d6edc6806 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -26,7 +26,7 @@ * control_code_page, a special page which has been set up to be preserved * during the copy operation. */ -ENTRY(arm64_relocate_new_kernel) +SYM_CODE_START(arm64_relocate_new_kernel) /* Setup the list loop variables. */ mov x18, x2 /* x18 = dtb address */ @@ -111,7 +111,7 @@ ENTRY(arm64_relocate_new_kernel) mov x3, xzr br x17 -ENDPROC(arm64_relocate_new_kernel) +SYM_CODE_END(arm64_relocate_new_kernel) .align 3 /* To keep the 64-bit values below naturally aligned. */ diff --git a/arch/arm64/kernel/scs.c b/arch/arm64/kernel/scs.c new file mode 100644 index 000000000000..e8f7ff45dd8f --- /dev/null +++ b/arch/arm64/kernel/scs.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include <linux/percpu.h> +#include <linux/scs.h> + +DEFINE_SCS(irq_shadow_call_stack); + +#ifdef CONFIG_ARM_SDE_INTERFACE +DEFINE_SCS(sdei_shadow_call_stack_normal); +DEFINE_SCS(sdei_shadow_call_stack_critical); +#endif diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index d6259dac62b6..dab88260b137 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -95,19 +95,7 @@ static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); unsigned long high = low + SDEI_STACK_SIZE; - if (!low) - return false; - - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_NORMAL; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_SDEI_NORMAL, info); } static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) @@ -115,19 +103,7 @@ static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); unsigned long high = low + SDEI_STACK_SIZE; - if (!low) - return false; - - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_CRITICAL; - } - - return true; + return on_stack(sp, low, high, STACK_TYPE_SDEI_CRITICAL, info); } bool _on_sdei_stack(unsigned long sp, struct stack_info *info) @@ -251,22 +227,12 @@ asmlinkage __kprobes notrace unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; - bool do_nmi_exit = false; - /* - * nmi_enter() deals with printk() re-entrance and use of RCU when - * RCU believed this CPU was idle. Because critical events can - * interrupt normal events, we may already be in_nmi(). 
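Stepping back to the new arch/arm64/kernel/scs.c above: DEFINE_SCS() only instantiates the per-CPU shadow stacks used for IRQ and SDEI handling; the protection itself is generated by the compiler (clang's -fsanitize=shadow-call-stack), which spills return addresses onto a separate stack addressed through the reserved x18 register. The C below is a conceptual model of what the instrumented prologue and epilogue do; it is purely illustrative, and no such C API exists in the kernel.

/* Stand-in for x18, which the compiler reserves as the shadow stack pointer. */
static unsigned long *scs_sp;

static inline void scs_prologue(unsigned long lr)
{
	*scs_sp++ = lr;		/* save the return address away from the normal stack */
}

static inline unsigned long scs_epilogue(void)
{
	return *--scs_sp;	/* return through the shadow copy, not the stack slot */
}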
- */ - if (!in_nmi()) { - nmi_enter(); - do_nmi_exit = true; - } + nmi_enter(); ret = _sdei_handler(regs, arg); - if (do_nmi_exit) - nmi_exit(); + nmi_exit(); return ret; } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 339882db5a91..801d56cdf701 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -732,6 +732,22 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->pc = (unsigned long)ka->sa.sa_handler; + /* + * Signal delivery is a (wacky) indirect function call in + * userspace, so simulate the same setting of BTYPE as a BLR + * <register containing the signal handler entry point>. + * Signal delivery to a location in a PROT_BTI guarded page + * that is not a function entry point will now trigger a + * SIGILL in userspace. + * + * If the signal handler entry point is not in a PROT_BTI + * guarded page, this is harmless. + */ + if (system_supports_bti()) { + regs->pstate &= ~PSR_BTYPE_MASK; + regs->pstate |= PSR_BTYPE_C; + } + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 7b2f2e650c44..ba40d57757d6 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -62,7 +62,7 @@ * * x0 = struct sleep_stack_data area */ -ENTRY(__cpu_suspend_enter) +SYM_FUNC_START(__cpu_suspend_enter) stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS] stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16] stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32] @@ -95,23 +95,22 @@ ENTRY(__cpu_suspend_enter) ldp x29, lr, [sp], #16 mov x0, #1 ret -ENDPROC(__cpu_suspend_enter) +SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" -ENTRY(cpu_resume) +SYM_CODE_START(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly - mov x0, #ARM64_CPU_RUNTIME bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir bl __enable_mmu ldr x8, =_cpu_resume br x8 -ENDPROC(cpu_resume) +SYM_CODE_END(cpu_resume) .ltorg .popsection -ENTRY(_cpu_resume) +SYM_FUNC_START(_cpu_resume) mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -147,4 +146,4 @@ ENTRY(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -ENDPROC(_cpu_resume) +SYM_FUNC_END(_cpu_resume) diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 54655273d1e0..1f93809528a4 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -30,9 +30,9 @@ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_smc) +SYM_FUNC_START(__arm_smccc_smc) SMCCC smc -ENDPROC(__arm_smccc_smc) +SYM_FUNC_END(__arm_smccc_smc) EXPORT_SYMBOL(__arm_smccc_smc) /* @@ -41,7 +41,7 @@ EXPORT_SYMBOL(__arm_smccc_smc) * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_hvc) +SYM_FUNC_START(__arm_smccc_hvc) SMCCC hvc -ENDPROC(__arm_smccc_hvc) +SYM_FUNC_END(__arm_smccc_hvc) EXPORT_SYMBOL(__arm_smccc_hvc) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 061f60fe452f..4b6f4999d06a 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -65,7 +65,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ struct secondary_data secondary_data; /* Number of CPUs which aren't online, but looping in kernel text. 
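Tying the setup_return() change above back to userspace: entering a signal handler with BTYPE set as for a BLR only works if the handler's entry point carries a landing pad, which compilers emit when branch protection is enabled. A minimal sketch, assuming the program is built with -mbranch-protection=standard on an arm64 toolchain:

#include <signal.h>

/*
 * With branch protection enabled, the compiler places a "bti c" landing pad
 * at the start of every function, so this handler remains callable even when
 * the page holding it is mapped with PROT_BTI.
 */
static void on_usr1(int sig)
{
	(void)sig;
}

int main(void)
{
	struct sigaction sa = { .sa_handler = on_usr1 };

	return sigaction(SIGUSR1, &sa, NULL);
}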
*/ -int cpus_stuck_in_kernel; +static int cpus_stuck_in_kernel; enum ipi_msg_type { IPI_RESCHEDULE, @@ -114,10 +114,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) */ secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; -#if defined(CONFIG_ARM64_PTR_AUTH) - secondary_data.ptrauth_key.apia.lo = idle->thread.keys_kernel.apia.lo; - secondary_data.ptrauth_key.apia.hi = idle->thread.keys_kernel.apia.hi; -#endif update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -140,10 +136,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; secondary_data.stack = NULL; -#if defined(CONFIG_ARM64_PTR_AUTH) - secondary_data.ptrauth_key.apia.lo = 0; - secondary_data.ptrauth_key.apia.hi = 0; -#endif __flush_dcache_area(&secondary_data, sizeof(secondary_data)); status = READ_ONCE(secondary_data.status); if (status == CPU_MMU_OFF) @@ -176,7 +168,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) panic("CPU%u detected unsupported configuration\n", cpu); } - return ret; + return -EIO; } static void init_gic_priority_masking(void) @@ -430,7 +422,7 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); - if (IS_ENABLED(CONFIG_KVM_ARM_HOST)) + if (IS_ENABLED(CONFIG_KVM)) kvm_compute_layout(); } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index a12c0c88d345..5f5b868292f5 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -98,6 +98,24 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, regs->orig_x0 = regs->regs[0]; regs->syscallno = scno; + /* + * BTI note: + * The architecture does not guarantee that SPSR.BTYPE is zero + * on taking an SVC, so we could return to userspace with a + * non-zero BTYPE after the syscall. + * + * This shouldn't matter except when userspace is explicitly + * doing something stupid, such as setting PROT_BTI on a page + * that lacks conforming BTI/PACIxSP instructions, falling + * through from one executable page to another with differing + * PROT_BTI, or messing with BTYPE via ptrace: in such cases, + * userspace should not be surprised if a SIGILL occurs on + * syscall return. + * + * So, don't touch regs->pstate & PSR_BTYPE_MASK here. + * (Similarly for HVC and SMC elsewhere.) 
+ */ + cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); user_exit(); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cf402be5c573..d332590f5978 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -272,6 +272,61 @@ void arm64_notify_die(const char *str, struct pt_regs *regs, } } +#ifdef CONFIG_COMPAT +#define PSTATE_IT_1_0_SHIFT 25 +#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) +#define PSTATE_IT_7_2_SHIFT 10 +#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) + +static u32 compat_get_it_state(struct pt_regs *regs) +{ + u32 it, pstate = regs->pstate; + + it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; + it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; + + return it; +} + +static void compat_set_it_state(struct pt_regs *regs, u32 it) +{ + u32 pstate_it; + + pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; + pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; + + regs->pstate &= ~PSR_AA32_IT_MASK; + regs->pstate |= pstate_it; +} + +static void advance_itstate(struct pt_regs *regs) +{ + u32 it; + + /* ARM mode */ + if (!(regs->pstate & PSR_AA32_T_BIT) || + !(regs->pstate & PSR_AA32_IT_MASK)) + return; + + it = compat_get_it_state(regs); + + /* + * If this is the last instruction of the block, wipe the IT + * state. Otherwise advance it. + */ + if (!(it & 7)) + it = 0; + else + it = (it & 0xe0) | ((it << 1) & 0x1f); + + compat_set_it_state(regs, it); +} +#else +static void advance_itstate(struct pt_regs *regs) +{ +} +#endif + void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) { regs->pc += size; @@ -282,6 +337,11 @@ void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) */ if (user_mode(regs)) user_fastforward_single_step(current); + + if (compat_user_mode(regs)) + advance_itstate(regs); + else + regs->pstate &= ~PSR_BTYPE_MASK; } static LIST_HEAD(undef_hook); @@ -411,6 +471,13 @@ void do_undefinstr(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_undefinstr); +void do_bti(struct pt_regs *regs) +{ + BUG_ON(!user_mode(regs)); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); +} +NOKPROBE_SYMBOL(do_bti); + #define __user_cache_maint(insn, address, res) \ if (address >= user_addr_max()) { \ res = -EFAULT; \ @@ -566,34 +633,7 @@ static const struct sys64_hook sys64_hooks[] = { {}, }; - #ifdef CONFIG_COMPAT -#define PSTATE_IT_1_0_SHIFT 25 -#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) -#define PSTATE_IT_7_2_SHIFT 10 -#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) - -static u32 compat_get_it_state(struct pt_regs *regs) -{ - u32 it, pstate = regs->pstate; - - it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; - it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; - - return it; -} - -static void compat_set_it_state(struct pt_regs *regs, u32 it) -{ - u32 pstate_it; - - pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; - pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; - - regs->pstate &= ~PSR_AA32_IT_MASK; - regs->pstate |= pstate_it; -} - static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) { int cond; @@ -614,42 +654,12 @@ static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) return aarch32_opcode_cond_checks[cond](regs->pstate); } -static void advance_itstate(struct pt_regs *regs) -{ - u32 it; - - /* ARM mode */ - if (!(regs->pstate & PSR_AA32_T_BIT) || - !(regs->pstate & PSR_AA32_IT_MASK)) - return; - 
- it = compat_get_it_state(regs); - - /* - * If this is the last instruction of the block, wipe the IT - * state. Otherwise advance it. - */ - if (!(it & 7)) - it = 0; - else - it = (it & 0xe0) | ((it << 1) & 0x1f); - - compat_set_it_state(regs, it); -} - -static void arm64_compat_skip_faulting_instruction(struct pt_regs *regs, - unsigned int sz) -{ - advance_itstate(regs); - arm64_skip_faulting_instruction(regs, sz); -} - static void compat_cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) { int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; pt_regs_write_reg(regs, reg, arch_timer_get_rate()); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_32_hooks[] = { @@ -669,7 +679,7 @@ static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_64_hooks[] = { @@ -690,7 +700,7 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs) * There is no T16 variant of a CP access, so we * always advance PC by 4 bytes. */ - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); return; } @@ -753,6 +763,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", + [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", @@ -906,17 +917,13 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) { - const bool was_in_nmi = in_nmi(); - - if (!was_in_nmi) - nmi_enter(); + nmi_enter(); /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - if (!was_in_nmi) - nmi_exit(); + nmi_exit(); } asmlinkage void enter_from_user_mode(void) @@ -1047,11 +1054,11 @@ int __init early_brk64(unsigned long addr, unsigned int esr, return bug_handler(regs, esr) != DBG_HOOK_HANDLED; } -/* This registration must happen early, before debug_traps_init(). 
*/ void __init trap_init(void) { register_kernel_break_hook(&bug_break_hook); #ifdef CONFIG_KASAN_SW_TAGS register_kernel_break_hook(&kasan_break_hook); #endif + debug_traps_init(); } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 033a48f30dbb..d51a898fd60f 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -33,20 +33,14 @@ extern char vdso_start[], vdso_end[]; extern char vdso32_start[], vdso32_end[]; #endif /* CONFIG_COMPAT_VDSO */ -/* vdso_lookup arch_index */ -enum arch_vdso_type { - ARM64_VDSO = 0, +enum vdso_abi { + VDSO_ABI_AA64, #ifdef CONFIG_COMPAT_VDSO - ARM64_VDSO32 = 1, + VDSO_ABI_AA32, #endif /* CONFIG_COMPAT_VDSO */ }; -#ifdef CONFIG_COMPAT_VDSO -#define VDSO_TYPES (ARM64_VDSO32 + 1) -#else -#define VDSO_TYPES (ARM64_VDSO + 1) -#endif /* CONFIG_COMPAT_VDSO */ -struct __vdso_abi { +struct vdso_abi_info { const char *name; const char *vdso_code_start; const char *vdso_code_end; @@ -57,14 +51,14 @@ struct __vdso_abi { struct vm_special_mapping *cm; }; -static struct __vdso_abi vdso_lookup[VDSO_TYPES] __ro_after_init = { - { +static struct vdso_abi_info vdso_info[] __ro_after_init = { + [VDSO_ABI_AA64] = { .name = "vdso", .vdso_code_start = vdso_start, .vdso_code_end = vdso_end, }, #ifdef CONFIG_COMPAT_VDSO - { + [VDSO_ABI_AA32] = { .name = "vdso32", .vdso_code_start = vdso32_start, .vdso_code_end = vdso32_end, @@ -81,13 +75,13 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int __vdso_remap(enum arch_vdso_type arch_index, +static int __vdso_remap(enum vdso_abi abi, const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_lookup[arch_index].vdso_code_end - - vdso_lookup[arch_index].vdso_code_start; + unsigned long vdso_size = vdso_info[abi].vdso_code_end - + vdso_info[abi].vdso_code_start; if (vdso_size != new_size) return -EINVAL; @@ -97,24 +91,24 @@ static int __vdso_remap(enum arch_vdso_type arch_index, return 0; } -static int __vdso_init(enum arch_vdso_type arch_index) +static int __vdso_init(enum vdso_abi abi) { int i; struct page **vdso_pagelist; unsigned long pfn; - if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) { + if (memcmp(vdso_info[abi].vdso_code_start, "\177ELF", 4)) { pr_err("vDSO is not a valid ELF object!\n"); return -EINVAL; } - vdso_lookup[arch_index].vdso_pages = ( - vdso_lookup[arch_index].vdso_code_end - - vdso_lookup[arch_index].vdso_code_start) >> + vdso_info[abi].vdso_pages = ( + vdso_info[abi].vdso_code_end - + vdso_info[abi].vdso_code_start) >> PAGE_SHIFT; /* Allocate the vDSO pagelist, plus a page for the data. */ - vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1, + vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); if (vdso_pagelist == NULL) @@ -125,26 +119,27 @@ static int __vdso_init(enum arch_vdso_type arch_index) /* Grab the vDSO code pages. 
*/ - pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start); + pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); - for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++) + for (i = 0; i < vdso_info[abi].vdso_pages; i++) vdso_pagelist[i + 1] = pfn_to_page(pfn + i); - vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0]; - vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1]; + vdso_info[abi].dm->pages = &vdso_pagelist[0]; + vdso_info[abi].cm->pages = &vdso_pagelist[1]; return 0; } -static int __setup_additional_pages(enum arch_vdso_type arch_index, +static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, int uses_interp) { unsigned long vdso_base, vdso_text_len, vdso_mapping_len; + unsigned long gp_flags = 0; void *ret; - vdso_text_len = vdso_lookup[arch_index].vdso_pages << PAGE_SHIFT; + vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ vdso_mapping_len = vdso_text_len + PAGE_SIZE; @@ -156,16 +151,19 @@ static int __setup_additional_pages(enum arch_vdso_type arch_index, ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, VM_READ|VM_MAYREAD, - vdso_lookup[arch_index].dm); + vdso_info[abi].dm); if (IS_ERR(ret)) goto up_fail; + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) + gp_flags = VM_ARM64_BTI; + vdso_base += PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_lookup[arch_index].cm); + vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; @@ -184,46 +182,42 @@ up_fail: static int aarch32_vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - return __vdso_remap(ARM64_VDSO32, sm, new_vma); + return __vdso_remap(VDSO_ABI_AA32, sm, new_vma); } #endif /* CONFIG_COMPAT_VDSO */ -/* - * aarch32_vdso_pages: - * 0 - kuser helpers - * 1 - sigreturn code - * or (CONFIG_COMPAT_VDSO): - * 0 - kuser helpers - * 1 - vdso data - * 2 - vdso code - */ -#define C_VECTORS 0 +enum aarch32_map { + AA32_MAP_VECTORS, /* kuser helpers */ #ifdef CONFIG_COMPAT_VDSO -#define C_VVAR 1 -#define C_VDSO 2 -#define C_PAGES (C_VDSO + 1) + AA32_MAP_VVAR, + AA32_MAP_VDSO, #else -#define C_SIGPAGE 1 -#define C_PAGES (C_SIGPAGE + 1) -#endif /* CONFIG_COMPAT_VDSO */ -static struct page *aarch32_vdso_pages[C_PAGES] __ro_after_init; -static struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = { - { + AA32_MAP_SIGPAGE +#endif +}; + +static struct page *aarch32_vectors_page __ro_after_init; +#ifndef CONFIG_COMPAT_VDSO +static struct page *aarch32_sig_page __ro_after_init; +#endif + +static struct vm_special_mapping aarch32_vdso_maps[] = { + [AA32_MAP_VECTORS] = { .name = "[vectors]", /* ABI */ - .pages = &aarch32_vdso_pages[C_VECTORS], + .pages = &aarch32_vectors_page, }, #ifdef CONFIG_COMPAT_VDSO - { + [AA32_MAP_VVAR] = { .name = "[vvar]", }, - { + [AA32_MAP_VDSO] = { .name = "[vdso]", .mremap = aarch32_vdso_mremap, }, #else - { + [AA32_MAP_SIGPAGE] = { .name = "[sigpage]", /* ABI */ - .pages = &aarch32_vdso_pages[C_SIGPAGE], + .pages = &aarch32_sig_page, }, #endif /* CONFIG_COMPAT_VDSO */ }; @@ -243,8 +237,8 @@ static int aarch32_alloc_kuser_vdso_page(void) memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, kuser_sz); - aarch32_vdso_pages[C_VECTORS] = virt_to_page(vdso_page); - flush_dcache_page(aarch32_vdso_pages[C_VECTORS]); + aarch32_vectors_page = virt_to_page(vdso_page); + 
flush_dcache_page(aarch32_vectors_page); return 0; } @@ -253,10 +247,10 @@ static int __aarch32_alloc_vdso_pages(void) { int ret; - vdso_lookup[ARM64_VDSO32].dm = &aarch32_vdso_spec[C_VVAR]; - vdso_lookup[ARM64_VDSO32].cm = &aarch32_vdso_spec[C_VDSO]; + vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; + vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; - ret = __vdso_init(ARM64_VDSO32); + ret = __vdso_init(VDSO_ABI_AA32); if (ret) return ret; @@ -275,8 +269,8 @@ static int __aarch32_alloc_vdso_pages(void) return -ENOMEM; memcpy((void *)sigpage, __aarch32_sigret_code_start, sigret_sz); - aarch32_vdso_pages[C_SIGPAGE] = virt_to_page(sigpage); - flush_dcache_page(aarch32_vdso_pages[C_SIGPAGE]); + aarch32_sig_page = virt_to_page(sigpage); + flush_dcache_page(aarch32_sig_page); ret = aarch32_alloc_kuser_vdso_page(); if (ret) @@ -306,7 +300,7 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, - &aarch32_vdso_spec[C_VECTORS]); + &aarch32_vdso_maps[AA32_MAP_VECTORS]); return PTR_ERR_OR_ZERO(ret); } @@ -330,7 +324,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) ret = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - &aarch32_vdso_spec[C_SIGPAGE]); + &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); if (IS_ERR(ret)) goto out; @@ -354,7 +348,7 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out; #ifdef CONFIG_COMPAT_VDSO - ret = __setup_additional_pages(ARM64_VDSO32, + ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, uses_interp); @@ -371,22 +365,19 @@ out: static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - return __vdso_remap(ARM64_VDSO, sm, new_vma); + return __vdso_remap(VDSO_ABI_AA64, sm, new_vma); } -/* - * aarch64_vdso_pages: - * 0 - vvar - * 1 - vdso - */ -#define A_VVAR 0 -#define A_VDSO 1 -#define A_PAGES (A_VDSO + 1) -static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = { - { +enum aarch64_map { + AA64_MAP_VVAR, + AA64_MAP_VDSO, +}; + +static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { + [AA64_MAP_VVAR] = { .name = "[vvar]", }, - { + [AA64_MAP_VDSO] = { .name = "[vdso]", .mremap = vdso_mremap, }, @@ -394,10 +385,10 @@ static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = { static int __init vdso_init(void) { - vdso_lookup[ARM64_VDSO].dm = &vdso_spec[A_VVAR]; - vdso_lookup[ARM64_VDSO].cm = &vdso_spec[A_VDSO]; + vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; - return __vdso_init(ARM64_VDSO); + return __vdso_init(VDSO_ABI_AA64); } arch_initcall(vdso_init); @@ -410,7 +401,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = __setup_additional_pages(ARM64_VDSO, + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index dd2514bb1511..556d424c6f52 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -17,22 +17,26 @@ obj-vdso := vgettimeofday.o note.o sigreturn.o targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) +btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti + +# -Bsymbolic has been added for consistency with 
arm, the compat vDSO and +# potential future proofing if we end up with internal calls to the exported +# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so +# preparation in build-time C")). ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ - --build-id -n -T + -Bsymbolic --eh-frame-hdr --build-id -n $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 ccflags-y += -DDISABLE_BRANCH_PROFILING -VDSO_LDFLAGS := -Bsymbolic - -CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y KCOV_INSTRUMENT := n -CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny +CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S index 0ce6ec75a525..3d4e82290c80 100644 --- a/arch/arm64/kernel/vdso/note.S +++ b/arch/arm64/kernel/vdso/note.S @@ -12,9 +12,12 @@ #include <linux/version.h> #include <linux/elfnote.h> #include <linux/build-salt.h> +#include <asm/assembler.h> ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END BUILD_SALT + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/sigreturn.S b/arch/arm64/kernel/vdso/sigreturn.S index 12324863d5c2..620a3ef837b7 100644 --- a/arch/arm64/kernel/vdso/sigreturn.S +++ b/arch/arm64/kernel/vdso/sigreturn.S @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Sigreturn trampoline for returning from a signal when the SA_RESTORER - * flag is not set. + * flag is not set. It serves primarily as a hall of shame for crappy + * unwinders and features an exciting but mysterious NOP instruction. + * + * It's also fragile as hell, so please think twice before changing anything + * in here. * * Copyright (C) 2012 ARM Limited * @@ -9,18 +13,54 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/unistd.h> .text - nop -SYM_FUNC_START(__kernel_rt_sigreturn) +/* Ensure that the mysterious NOP can be associated with a function. */ .cfi_startproc + +/* + * .cfi_signal_frame causes the corresponding Frame Description Entry in the + * .eh_frame section to be annotated as a signal frame. This allows DWARF + * unwinders (e.g. libstdc++) to implement _Unwind_GetIPInfo(), which permits + * unwinding out of the signal trampoline without the need for the mysterious + * NOP. + */ .cfi_signal_frame - .cfi_def_cfa x29, 0 - .cfi_offset x29, 0 * 8 - .cfi_offset x30, 1 * 8 + +/* + * Tell the unwinder where to locate the frame record linking back to the + * interrupted context. We don't provide unwind info for registers other + * than the frame pointer and the link register here; in practice, this + * is sufficient for unwinding in C/C++ based runtimes and the values in + * the sigcontext may have been modified by this point anyway. Debuggers + * already have baked-in strategies for attempting to unwind out of signals. + */ + .cfi_def_cfa x29, 0 + .cfi_offset x29, 0 * 8 + .cfi_offset x30, 1 * 8 + +/* + * This mysterious NOP is required for some unwinders (e.g. libc++) that + * unconditionally subtract one from the result of _Unwind_GetIP() in order to + * identify the calling function. + * Hack borrowed from arch/powerpc/kernel/vdso64/sigtramp.S. 
+ */ + nop // Mysterious NOP + +/* + * GDB relies on being able to identify the sigreturn instruction sequence to + * unwind from signal handlers. We cannot, therefore, use SYM_FUNC_START() + * here, as it will emit a BTI C instruction and break the unwinder. Thankfully, + * this function is only ever called from a RET and so omitting the landing pad + * is perfectly fine. + */ +SYM_CODE_START(__kernel_rt_sigreturn) mov x8, #__NR_rt_sigreturn svc #0 .cfi_endproc -SYM_FUNC_END(__kernel_rt_sigreturn) +SYM_CODE_END(__kernel_rt_sigreturn) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S index d1414fee5274..c4b1990bf2be 100644 --- a/arch/arm64/kernel/vdso/vdso.S +++ b/arch/arm64/kernel/vdso/vdso.S @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/linkage.h> #include <linux/const.h> +#include <asm/assembler.h> #include <asm/page.h> .globl vdso_start, vdso_end @@ -19,3 +20,5 @@ vdso_start: vdso_end: .previous + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso32/sigreturn.S b/arch/arm64/kernel/vdso32/sigreturn.S index 620524969696..b0091064c3d6 100644 --- a/arch/arm64/kernel/vdso32/sigreturn.S +++ b/arch/arm64/kernel/vdso32/sigreturn.S @@ -3,6 +3,9 @@ * This file provides both A32 and T32 versions, in accordance with the * arm sigreturn code. * + * Please read the comments in arch/arm64/kernel/vdso/sigreturn.S to + * understand some of the craziness in here. + * * Copyright (C) 2018 ARM Limited */ @@ -17,39 +20,39 @@ .save {r0-r15} .pad #COMPAT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_sigreturn_arm) +SYM_CODE_START(__kernel_sigreturn_arm) mov r7, #__NR_compat_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_sigreturn_arm) +SYM_CODE_END(__kernel_sigreturn_arm) .fnstart .save {r0-r15} .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_rt_sigreturn_arm) +SYM_CODE_START(__kernel_rt_sigreturn_arm) mov r7, #__NR_compat_rt_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_rt_sigreturn_arm) +SYM_CODE_END(__kernel_rt_sigreturn_arm) .thumb .fnstart .save {r0-r15} .pad #COMPAT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_sigreturn_thumb) +SYM_CODE_START(__kernel_sigreturn_thumb) mov r7, #__NR_compat_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_sigreturn_thumb) +SYM_CODE_END(__kernel_sigreturn_thumb) .fnstart .save {r0-r15} .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET nop -SYM_FUNC_START(__kernel_rt_sigreturn_thumb) +SYM_CODE_START(__kernel_rt_sigreturn_thumb) mov r7, #__NR_compat_rt_sigreturn svc #0 .fnend -SYM_FUNC_END(__kernel_rt_sigreturn_thumb) +SYM_CODE_END(__kernel_rt_sigreturn_thumb) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 497f9675071d..3be632177631 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -17,10 +17,6 @@ #include "image.h" -/* .exit.text needed in case of alternative patching */ -#define ARM_EXIT_KEEP(x) x -#define ARM_EXIT_DISCARD(x) - OUTPUT_ARCH(aarch64) ENTRY(_text) @@ -72,8 +68,8 @@ jiffies = jiffies_64; /* * The size of the PE/COFF section that covers the kernel image, which - * runs from stext to _edata, must be a round multiple of the PE/COFF - * FileAlignment, which we set to its minimum value of 0x200. 'stext' + * runs from _stext to _edata, must be a round multiple of the PE/COFF + * FileAlignment, which we set to its minimum value of 0x200. '_stext' * itself is 4 KB aligned, so padding out _edata to a 0x200 aligned * boundary should be sufficient. */ @@ -95,8 +91,6 @@ SECTIONS * order of matching. 
*/ /DISCARD/ : { - ARM_EXIT_DISCARD(EXIT_TEXT) - ARM_EXIT_DISCARD(EXIT_DATA) EXIT_CALL *(.discard) *(.discard.*) @@ -139,6 +133,7 @@ SECTIONS idmap_pg_dir = .; . += IDMAP_DIR_SIZE; + idmap_pg_end = .; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_pg_dir = .; @@ -161,7 +156,7 @@ SECTIONS __exittext_begin = .; .exit.text : { - ARM_EXIT_KEEP(EXIT_TEXT) + EXIT_TEXT } __exittext_end = .; @@ -175,7 +170,7 @@ SECTIONS *(.altinstr_replacement) } - . = ALIGN(PAGE_SIZE); + . = ALIGN(SEGMENT_ALIGN); __inittext_end = .; __initdata_begin = .; @@ -188,7 +183,7 @@ SECTIONS *(.init.rodata.* .init.bss) /* from the EFI stub */ } .exit.data : { - ARM_EXIT_KEEP(EXIT_DATA) + EXIT_DATA } PERCPU_SECTION(L1_CACHE_BYTES) @@ -246,6 +241,7 @@ SECTIONS . += INIT_DIR_SIZE; init_pg_end = .; + . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 449386d76441..f1c1f981482c 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -3,7 +3,6 @@ # KVM configuration # -source "virt/kvm/Kconfig" source "virt/lib/Kconfig" menuconfig VIRTUALIZATION @@ -18,7 +17,7 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM +menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF # for TASKSTATS/TASK_DELAY_ACCT: @@ -28,13 +27,11 @@ config KVM select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO - select KVM_ARM_HOST select KVM_GENERIC_DIRTYLOG_READ_PROTECT select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD - select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING @@ -45,23 +42,24 @@ config KVM select TASK_DELAY_ACCT ---help--- Support hosting virtualized guest machines. - We don't support KVM with 16K page tables yet, due to the multiple - levels of fake page tables. If unsure, say N. -config KVM_ARM_HOST - bool - ---help--- - Provides host support for ARM processors. +if KVM + +source "virt/kvm/Kconfig" config KVM_ARM_PMU - bool + bool "Virtual Performance Monitoring Unit (PMU) support" + depends on HW_PERF_EVENTS + default y ---help--- Adds support for a virtual Performance Monitoring Unit (PMU) in virtual machines. 
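One knock-on effect of dropping KVM_ARM_HOST is that arm64 code now keys off CONFIG_KVM directly, as the smp.c hunk above already does. A sketch of the usual pattern, with a hypothetical caller (kvm_compute_layout() is the real hook; its declaration comes from the KVM headers):

#include <linux/kconfig.h>

/* Hypothetical caller: IS_ENABLED() is a compile-time constant, no #ifdef needed. */
static void maybe_compute_kvm_layout(void)
{
	if (IS_ENABLED(CONFIG_KVM))
		kvm_compute_layout();
}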
config KVM_INDIRECT_VECTORS - def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS) + def_bool HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS + +endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 5ffbdc39e780..8d3d9513cbfe 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -3,37 +3,25 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic +ccflags-y += -I $(srctree)/$(src) KVM=../../../virt/kvm -obj-$(CONFIG_KVM_ARM_HOST) += kvm.o -obj-$(CONFIG_KVM_ARM_HOST) += hyp/ +obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM) += hyp/ -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hypercalls.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/pvtime.o +kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ + $(KVM)/vfio.o $(KVM)/irqchip.o \ + arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ + inject_fault.o regmap.o va_layout.o hyp.o hyp-init.o handle_exit.o \ + guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o \ + vgic-sys-reg-v3.o fpsimd.o pmu.o \ + aarch32.o arch_timer.o \ + vgic/vgic.o vgic/vgic-init.o \ + vgic/vgic-irqfd.o vgic/vgic-v2.o \ + vgic/vgic-v3.o vgic/vgic-v4.o \ + vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ + vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ + vgic/vgic-its.o vgic/vgic-debug.o -kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o -kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o -kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o - -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v4.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o -kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o +kvm-$(CONFIG_KVM_ARM_PMU) += pmu-emul.o diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c new file mode 100644 index 000000000000..0a356aa91aa1 --- /dev/null +++ b/arch/arm64/kvm/aarch32.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (not much of an) Emulation layer for 32bit guests. 
+ * + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * based on arch/arm/kvm/emulate.c + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/bits.h> +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> + +#define DFSR_FSC_EXTABT_LPAE 0x10 +#define DFSR_FSC_EXTABT_nLPAE 0x08 +#define DFSR_LPAE BIT(9) + +/* + * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. + */ +static const u8 return_offsets[8][2] = { + [0] = { 0, 0 }, /* Reset, unused */ + [1] = { 4, 2 }, /* Undefined */ + [2] = { 0, 0 }, /* SVC, unused */ + [3] = { 4, 4 }, /* Prefetch abort */ + [4] = { 8, 8 }, /* Data abort */ + [5] = { 0, 0 }, /* HVC, unused */ + [6] = { 4, 4 }, /* IRQ, unused */ + [7] = { 4, 4 }, /* FIQ, unused */ +}; + +/* + * When an exception is taken, most CPSR fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). + * + * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with + * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was + * obsoleted by the ARMv7 virtualization extensions and is RES0. + * + * For the SPSR layout seen from AArch32, see: + * - ARM DDI 0406C.d, page B1-1148 + * - ARM DDI 0487E.a, page G8-6264 + * + * For the SPSR_ELx layout for AArch32 seen from AArch64, see: + * - ARM DDI 0487E.a, page C5-426 + * + * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from + * MSB to LSB. + */ +static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) +{ + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_AA32_N_BIT); + new |= (old & PSR_AA32_Z_BIT); + new |= (old & PSR_AA32_C_BIT); + new |= (old & PSR_AA32_V_BIT); + new |= (old & PSR_AA32_Q_BIT); + + // CPSR.IT[7:0] are set to zero upon any exception + // See ARM DDI 0487E.a, section G1.12.3 + // See ARM DDI 0406C.d, section B1.8.3 + + new |= (old & PSR_AA32_DIT_BIT); + + // CPSR.SSBS is set to SCTLR.DSSBS upon any exception + // See ARM DDI 0487E.a, page G8-6244 + if (sctlr & BIT(31)) + new |= PSR_AA32_SSBS_BIT; + + // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 + // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page G8-6246 + new |= (old & PSR_AA32_PAN_BIT); + if (!(sctlr & BIT(23))) + new |= PSR_AA32_PAN_BIT; + + // SS does not exist in AArch32, so ignore + + // CPSR.IL is set to zero upon any exception + // See ARM DDI 0487E.a, page G1-5527 + + new |= (old & PSR_AA32_GE_MASK); + + // CPSR.IT[7:0] are set to zero upon any exception + // See prior comment above + + // CPSR.E is set to SCTLR.EE upon any exception + // See ARM DDI 0487E.a, page G8-6245 + // See ARM DDI 0406C.d, page B4-1701 + if (sctlr & BIT(25)) + new |= PSR_AA32_E_BIT; + + // CPSR.A is unchanged upon an exception to Undefined, Supervisor + // CPSR.A is set upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_A_BIT); + if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) + new |= PSR_AA32_A_BIT; + + // CPSR.I is set upon any exception + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= PSR_AA32_I_BIT; + + // CPSR.F is set upon an exception to FIQ + // CPSR.F is unchanged upon an exception to other modes + // See ARM DDI 0487E.a, 
pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_F_BIT); + if (mode == PSR_AA32_MODE_FIQ) + new |= PSR_AA32_F_BIT; + + // CPSR.T is set to SCTLR.TE upon any exception + // See ARM DDI 0487E.a, page G8-5514 + // See ARM DDI 0406C.d, page B1-1181 + if (sctlr & BIT(30)) + new |= PSR_AA32_T_BIT; + + new |= mode; + + return new; +} + +static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) +{ + unsigned long spsr = *vcpu_cpsr(vcpu); + bool is_thumb = (spsr & PSR_AA32_T_BIT); + u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + + *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); + + /* Note: These now point to the banked copies */ + vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr)); + *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; + + /* Branch to exception vector */ + if (sctlr & (1 << 13)) + vect_offset += 0xffff0000; + else /* always have security exceptions */ + vect_offset += vcpu_cp15(vcpu, c12_VBAR); + + *vcpu_pc(vcpu) = vect_offset; +} + +void kvm_inject_undef32(struct kvm_vcpu *vcpu) +{ + prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4); +} + +/* + * Modelled after TakeDataAbortException() and TakePrefetchAbortException + * pseudocode. + */ +static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, + unsigned long addr) +{ + u32 vect_offset; + u32 *far, *fsr; + bool is_lpae; + + if (is_pabt) { + vect_offset = 12; + far = &vcpu_cp15(vcpu, c6_IFAR); + fsr = &vcpu_cp15(vcpu, c5_IFSR); + } else { /* !iabt */ + vect_offset = 16; + far = &vcpu_cp15(vcpu, c6_DFAR); + fsr = &vcpu_cp15(vcpu, c5_DFSR); + } + + prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset); + + *far = addr; + + /* Give the guest an IMPLEMENTATION DEFINED exception */ + is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); + if (is_lpae) { + *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; + } else { + /* no need to shuffle FS[4] into DFSR[10] as its 0 */ + *fsr = DFSR_FSC_EXTABT_nLPAE; + } +} + +void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) +{ + inject_abt32(vcpu, false, addr); +} + +void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr) +{ + inject_abt32(vcpu, true, addr); +} diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c new file mode 100644 index 000000000000..a1fe0ea3254e --- /dev/null +++ b/arch/arm64/kvm/arch_timer.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 ARM Ltd. 
+ * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/uaccess.h> + +#include <clocksource/arm_arch_timer.h> +#include <asm/arch_timer.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> + +#include <kvm/arm_vgic.h> +#include <kvm/arm_arch_timer.h> + +#include "trace.h" + +static struct timecounter *timecounter; +static unsigned int host_vtimer_irq; +static unsigned int host_ptimer_irq; +static u32 host_vtimer_irq_flags; +static u32 host_ptimer_irq_flags; + +static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); + +static const struct kvm_irq_level default_ptimer_irq = { + .irq = 30, + .level = 1, +}; + +static const struct kvm_irq_level default_vtimer_irq = { + .irq = 27, + .level = 1, +}; + +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx); +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val); +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg); + +u64 kvm_phys_timer_read(void) +{ + return timecounter->cc->read(timecounter->cc); +} + +static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + if (has_vhe()) { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_ptimer = NULL; + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = NULL; + map->emul_ptimer = vcpu_ptimer(vcpu); + } + + trace_kvm_get_timer_map(vcpu->vcpu_id, map); +} + +static inline bool userspace_irqchip(struct kvm *kvm) +{ + return static_branch_unlikely(&userspace_irqchip_in_use) && + unlikely(!irqchip_in_kernel(kvm)); +} + +static void soft_timer_start(struct hrtimer *hrt, u64 ns) +{ + hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), + HRTIMER_MODE_ABS_HARD); +} + +static void soft_timer_cancel(struct hrtimer *hrt) +{ + hrtimer_cancel(hrt); +} + +static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) +{ + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; + struct arch_timer_context *ctx; + struct timer_map map; + + /* + * We may see a timer interrupt after vcpu_put() has been called which + * sets the CPU's vcpu pointer to NULL, because even though the timer + * has been disabled in timer_save_state(), the hardware interrupt + * signal may not have been retired from the interrupt controller yet. 
+ */ + if (!vcpu) + return IRQ_HANDLED; + + get_timer_map(vcpu, &map); + + if (irq == host_vtimer_irq) + ctx = map.direct_vtimer; + else + ctx = map.direct_ptimer; + + if (kvm_timer_should_fire(ctx)) + kvm_timer_update_irq(vcpu, true, ctx); + + if (userspace_irqchip(vcpu->kvm) && + !static_branch_unlikely(&has_gic_active_state)) + disable_percpu_irq(host_vtimer_irq); + + return IRQ_HANDLED; +} + +static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) +{ + u64 cval, now; + + cval = timer_ctx->cnt_cval; + now = kvm_phys_timer_read() - timer_ctx->cntvoff; + + if (now < cval) { + u64 ns; + + ns = cyclecounter_cyc2ns(timecounter->cc, + cval - now, + timecounter->mask, + &timecounter->frac); + return ns; + } + + return 0; +} + +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) +{ + WARN_ON(timer_ctx && timer_ctx->loaded); + return timer_ctx && + !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); +} + +/* + * Returns the earliest expiration time in ns among guest timers. + * Note that it will return 0 if none of timers can fire. + */ +static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) +{ + u64 min_delta = ULLONG_MAX; + int i; + + for (i = 0; i < NR_KVM_TIMERS; i++) { + struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; + + WARN(ctx->loaded, "timer %d loaded\n", i); + if (kvm_timer_irq_can_fire(ctx)) + min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); + } + + /* If none of timers can fire, then return 0 */ + if (min_delta == ULLONG_MAX) + return 0; + + return min_delta; +} + +static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) +{ + struct arch_timer_cpu *timer; + struct kvm_vcpu *vcpu; + u64 ns; + + timer = container_of(hrt, struct arch_timer_cpu, bg_timer); + vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); + + /* + * Check that the timer has really expired from the guest's + * PoV (NTP on the host may have forced it to expire + * early). If we should have slept longer, restart it. + */ + ns = kvm_timer_earliest_exp(vcpu); + if (unlikely(ns)) { + hrtimer_forward_now(hrt, ns_to_ktime(ns)); + return HRTIMER_RESTART; + } + + kvm_vcpu_wake_up(vcpu); + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) +{ + struct arch_timer_context *ctx; + struct kvm_vcpu *vcpu; + u64 ns; + + ctx = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = ctx->vcpu; + + trace_kvm_timer_hrtimer_expire(ctx); + + /* + * Check that the timer has really expired from the guest's + * PoV (NTP on the host may have forced it to expire + * early). If not ready, schedule for a later time. 
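kvm_timer_compute_delta() above converts the remaining cycle count into nanoseconds through the timecounter's scaling factors. Stripped of the fractional-remainder bookkeeping that cyclecounter_cyc2ns() keeps in "frac", the arithmetic is a multiply-and-shift; the helper and the mult/shift values below are made up for illustration only.

#include <stdint.h>

/* ns is approximately (cycles * mult) >> shift. */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

/*
 * Example: a 50 MHz counter (20 ns per cycle) can be described by
 * mult = 20 << 8 and shift = 8, so 50 cycles -> (50 * 5120) >> 8 = 1000 ns.
 */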
+ */ + ns = kvm_timer_compute_delta(ctx); + if (unlikely(ns)) { + hrtimer_forward_now(hrt, ns_to_ktime(ns)); + return HRTIMER_RESTART; + } + + kvm_timer_update_irq(vcpu, true, ctx); + return HRTIMER_NORESTART; +} + +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) +{ + enum kvm_arch_timers index; + u64 cval, now; + + if (!timer_ctx) + return false; + + index = arch_timer_ctx_index(timer_ctx); + + if (timer_ctx->loaded) { + u32 cnt_ctl = 0; + + switch (index) { + case TIMER_VTIMER: + cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); + break; + case TIMER_PTIMER: + cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); + break; + case NR_KVM_TIMERS: + /* GCC is braindead */ + cnt_ctl = 0; + break; + } + + return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && + (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && + !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); + } + + if (!kvm_timer_irq_can_fire(timer_ctx)) + return false; + + cval = timer_ctx->cnt_cval; + now = kvm_phys_timer_read() - timer_ctx->cntvoff; + + return cval <= now; +} + +bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) +{ + struct timer_map map; + + get_timer_map(vcpu, &map); + + return kvm_timer_should_fire(map.direct_vtimer) || + kvm_timer_should_fire(map.direct_ptimer) || + kvm_timer_should_fire(map.emul_ptimer); +} + +/* + * Reflect the timer output level into the kvm_run structure + */ +void kvm_timer_update_run(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the device bitmap with the timer states */ + regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | + KVM_ARM_DEV_EL1_PTIMER); + if (kvm_timer_should_fire(vtimer)) + regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; + if (kvm_timer_should_fire(ptimer)) + regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; +} + +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx) +{ + int ret; + + timer_ctx->irq.level = new_level; + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, + timer_ctx->irq.level); + + if (!userspace_irqchip(vcpu->kvm)) { + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + timer_ctx->irq.irq, + timer_ctx->irq.level, + timer_ctx); + WARN_ON(ret); + } +} + +/* Only called for a fully emulated timer */ +static void timer_emulate(struct arch_timer_context *ctx) +{ + bool should_fire = kvm_timer_should_fire(ctx); + + trace_kvm_timer_emulate(ctx, should_fire); + + if (should_fire != ctx->irq.level) { + kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); + return; + } + + /* + * If the timer can fire now, we don't need to have a soft timer + * scheduled for the future. If the timer cannot fire at all, + * then we also don't need a soft timer. 
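Condensing kvm_timer_should_fire() above for the not-loaded case: the line is asserted when the timer is enabled, unmasked, and its compare value has been reached in guest time (physical count minus CNTVOFF). A self-contained predicate, mirroring the code above rather than replacing it:

#include <linux/types.h>
#include <clocksource/arm_arch_timer.h>	/* ARCH_TIMER_CTRL_* bits */

static bool timer_line_asserted(u32 ctl, u64 cval, u64 cntpct, u64 cntvoff)
{
	return (ctl & ARCH_TIMER_CTRL_ENABLE) &&
	       !(ctl & ARCH_TIMER_CTRL_IT_MASK) &&
	       (cntpct - cntvoff) >= cval;
}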
+ */ + if (!kvm_timer_irq_can_fire(ctx)) { + soft_timer_cancel(&ctx->hrtimer); + return; + } + + soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); +} + +static void timer_save_state(struct arch_timer_context *ctx) +{ + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); + unsigned long flags; + + if (!timer->enabled) + return; + + local_irq_save(flags); + + if (!ctx->loaded) + goto out; + + switch (index) { + case TIMER_VTIMER: + ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); + ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL); + + /* Disable the timer */ + write_sysreg_el0(0, SYS_CNTV_CTL); + isb(); + + break; + case TIMER_PTIMER: + ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); + ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL); + + /* Disable the timer */ + write_sysreg_el0(0, SYS_CNTP_CTL); + isb(); + + break; + case NR_KVM_TIMERS: + BUG(); + } + + trace_kvm_timer_save_state(ctx); + + ctx->loaded = false; +out: + local_irq_restore(flags); +} + +/* + * Schedule the background timer before calling kvm_vcpu_block, so that this + * thread is removed from its waitqueue and made runnable when there's a timer + * interrupt to handle. + */ +static void kvm_timer_blocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * If no timers are capable of raising interrupts (disabled or + * masked), then there's no more work for us to do. + */ + if (!kvm_timer_irq_can_fire(map.direct_vtimer) && + !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_ptimer)) + return; + + /* + * At least one guest time will expire. Schedule a background timer. + * Set the earliest expiration time among the guest timers. + */ + soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); +} + +static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + soft_timer_cancel(&timer->bg_timer); +} + +static void timer_restore_state(struct arch_timer_context *ctx) +{ + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); + unsigned long flags; + + if (!timer->enabled) + return; + + local_irq_save(flags); + + if (ctx->loaded) + goto out; + + switch (index) { + case TIMER_VTIMER: + write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL); + isb(); + write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL); + break; + case TIMER_PTIMER: + write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL); + isb(); + write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL); + break; + case NR_KVM_TIMERS: + BUG(); + } + + trace_kvm_timer_restore_state(ctx); + + ctx->loaded = true; +out: + local_irq_restore(flags); +} + +static void set_cntvoff(u64 cntvoff) +{ + kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); +} + +static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) +{ + int r; + r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); + WARN_ON(r); +} + +static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) +{ + struct kvm_vcpu *vcpu = ctx->vcpu; + bool phys_active = false; + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. 
+ */ + kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); + + if (irqchip_in_kernel(vcpu->kvm)) + phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); + + phys_active |= ctx->irq.level; + + set_timer_irq_phys_active(ctx, phys_active); +} + +static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); + + /* + * When using a userspace irqchip with the architected timers and a + * host interrupt controller that doesn't support an active state, we + * must still prevent continuously exiting from the guest, and + * therefore mask the physical interrupt by disabling it on the host + * interrupt controller when the virtual level is high, such that the + * guest can make forward progress. Once we detect the output level + * being de-asserted, we unmask the interrupt again so that we exit + * from the guest when the timer fires. + */ + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); +} + +void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + if (unlikely(!timer->enabled)) + return; + + get_timer_map(vcpu, &map); + + if (static_branch_likely(&has_gic_active_state)) { + kvm_timer_vcpu_load_gic(map.direct_vtimer); + if (map.direct_ptimer) + kvm_timer_vcpu_load_gic(map.direct_ptimer); + } else { + kvm_timer_vcpu_load_nogic(vcpu); + } + + set_cntvoff(map.direct_vtimer->cntvoff); + + kvm_timer_unblocking(vcpu); + + timer_restore_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_restore_state(map.direct_ptimer); + + if (map.emul_ptimer) + timer_emulate(map.emul_ptimer); +} + +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool vlevel, plevel; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; + plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; + + return kvm_timer_should_fire(vtimer) != vlevel || + kvm_timer_should_fire(ptimer) != plevel; +} + +void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); + + if (unlikely(!timer->enabled)) + return; + + get_timer_map(vcpu, &map); + + timer_save_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_save_state(map.direct_ptimer); + + /* + * Cancel soft timer emulation, because the only case where we + * need it after a vcpu_put is in the context of a sleeping VCPU, and + * in that case we already factor in the deadline for the physical + * timer when scheduling the bg_timer. + * + * In any case, we re-schedule the hrtimer for the physical timer when + * coming back to the VCPU thread in kvm_timer_vcpu_load(). 
+ */ + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); + + if (rcuwait_active(wait)) + kvm_timer_blocking(vcpu); + + /* + * The kernel may decide to run userspace after calling vcpu_put, so + * we reset cntvoff to 0 to ensure a consistent read between user + * accesses to the virtual counter and kernel access to the physical + * counter of non-VHE case. For VHE, the virtual counter uses a fixed + * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. + */ + set_cntvoff(0); +} + +/* + * With a userspace irqchip we have to check if the guest de-asserted the + * timer and if so, unmask the timer irq signal on the host interrupt + * controller to ensure that we see future timer signals. + */ +static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + if (!kvm_timer_should_fire(vtimer)) { + kvm_timer_update_irq(vcpu, false, vtimer); + if (static_branch_likely(&has_gic_active_state)) + set_timer_irq_phys_active(vtimer, false); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + } +} + +void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + if (unlikely(!timer->enabled)) + return; + + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + unmask_vtimer_irq_user(vcpu); +} + +int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 + * and to 0 for ARMv7. We provide an implementation that always + * resets the timer to be disabled and unmasked and is compliant with + * the ARMv7 architecture. + */ + vcpu_vtimer(vcpu)->cnt_ctl = 0; + vcpu_ptimer(vcpu)->cnt_ctl = 0; + + if (timer->enabled) { + kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); + kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); + + if (irqchip_in_kernel(vcpu->kvm)) { + kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); + if (map.direct_ptimer) + kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); + } + } + + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); + + return 0; +} + +/* Make the updates of cntvoff for all vtimer contexts atomic */ +static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) +{ + int i; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, tmp, kvm) + vcpu_vtimer(tmp)->cntvoff = cntvoff; + + /* + * When called from the vcpu create path, the CPU being created is not + * included in the loop above, so we just set it here as well. + */ + vcpu_vtimer(vcpu)->cntvoff = cntvoff; + mutex_unlock(&kvm->lock); +} + +void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + + /* Synchronize cntvoff across all vtimers of a VM. 
*/ + update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); + ptimer->cntvoff = 0; + + hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + timer->bg_timer.function = kvm_bg_timer_expire; + + hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + vtimer->hrtimer.function = kvm_hrtimer_expire; + ptimer->hrtimer.function = kvm_hrtimer_expire; + + vtimer->irq.irq = default_vtimer_irq.irq; + ptimer->irq.irq = default_ptimer_irq.irq; + + vtimer->host_timer_irq = host_vtimer_irq; + ptimer->host_timer_irq = host_ptimer_irq; + + vtimer->host_timer_irq_flags = host_vtimer_irq_flags; + ptimer->host_timer_irq_flags = host_ptimer_irq_flags; + + vtimer->vcpu = vcpu; + ptimer->vcpu = vcpu; +} + +static void kvm_timer_init_interrupt(void *info) +{ + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); +} + +int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) +{ + struct arch_timer_context *timer; + + switch (regid) { + case KVM_REG_ARM_TIMER_CTL: + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); + break; + case KVM_REG_ARM_TIMER_CNT: + timer = vcpu_vtimer(vcpu); + update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); + break; + case KVM_REG_ARM_TIMER_CVAL: + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); + break; + case KVM_REG_ARM_PTIMER_CTL: + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); + break; + case KVM_REG_ARM_PTIMER_CVAL: + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); + break; + + default: + return -1; + } + + return 0; +} + +static u64 read_timer_ctl(struct arch_timer_context *timer) +{ + /* + * Set ISTATUS bit if it's expired. + * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is + * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit + * regardless of ENABLE bit for our implementation convenience. 
+ */ + if (!kvm_timer_compute_delta(timer)) + return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT; + else + return timer->cnt_ctl; +} + +u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) +{ + switch (regid) { + case KVM_REG_ARM_TIMER_CTL: + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CTL); + case KVM_REG_ARM_TIMER_CNT: + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); + case KVM_REG_ARM_TIMER_CVAL: + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CVAL); + case KVM_REG_ARM_PTIMER_CTL: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CTL); + case KVM_REG_ARM_PTIMER_CNT: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CNT); + case KVM_REG_ARM_PTIMER_CVAL: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CVAL); + } + return (u64)-1; +} + +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + switch (treg) { + case TIMER_REG_TVAL: + val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff; + val &= lower_32_bits(val); + break; + + case TIMER_REG_CTL: + val = read_timer_ctl(timer); + break; + + case TIMER_REG_CVAL: + val = timer->cnt_cval; + break; + + case TIMER_REG_CNT: + val = kvm_phys_timer_read() - timer->cntvoff; + break; + + default: + BUG(); + } + + return val; +} + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); + + return val; +} + +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val) +{ + switch (treg) { + case TIMER_REG_TVAL: + timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val; + break; + + case TIMER_REG_CTL: + timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT; + break; + + case TIMER_REG_CVAL: + timer->cnt_cval = val; + break; + + default: + BUG(); + } +} + +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val) +{ + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); +} + +static int kvm_timer_starting_cpu(unsigned int cpu) +{ + kvm_timer_init_interrupt(NULL); + return 0; +} + +static int kvm_timer_dying_cpu(unsigned int cpu) +{ + disable_percpu_irq(host_vtimer_irq); + return 0; +} + +int kvm_timer_hyp_init(bool has_gic) +{ + struct arch_timer_kvm_info *info; + int err; + + info = arch_timer_get_kvm_info(); + timecounter = &info->timecounter; + + if (!timecounter->cc) { + kvm_err("kvm_arch_timer: uninitialized timecounter\n"); + return -ENODEV; + } + + /* First, do the virtual EL1 timer irq */ + + if (info->virtual_irq <= 0) { + kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", + info->virtual_irq); + return -ENODEV; + } + host_vtimer_irq = info->virtual_irq; + + host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); + if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && + host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n", + host_vtimer_irq); + host_vtimer_irq_flags = IRQF_TRIGGER_LOW; + } + + err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, + "kvm guest vtimer", 
kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", + host_vtimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_vtimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + + static_branch_enable(&has_gic_active_state); + } + + kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); + + /* Now let's do the physical EL1 timer irq */ + + if (info->physical_irq > 0) { + host_ptimer_irq = info->physical_irq; + host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq); + if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH && + host_ptimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n", + host_ptimer_irq); + host_ptimer_irq_flags = IRQF_TRIGGER_LOW; + } + + err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, + "kvm guest ptimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", + host_ptimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_ptimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + } + + kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); + } else if (has_vhe()) { + kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", + info->physical_irq); + err = -ENODEV; + goto out_free_irq; + } + + cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, + "kvm/arm/timer:starting", kvm_timer_starting_cpu, + kvm_timer_dying_cpu); + return 0; +out_free_irq: + free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); + return err; +} + +void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + soft_timer_cancel(&timer->bg_timer); +} + +static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) +{ + int vtimer_irq, ptimer_irq; + int i, ret; + + vtimer_irq = vcpu_vtimer(vcpu)->irq.irq; + ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu)); + if (ret) + return false; + + ptimer_irq = vcpu_ptimer(vcpu)->irq.irq; + ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu)); + if (ret) + return false; + + kvm_for_each_vcpu(i, vcpu, vcpu->kvm) { + if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq || + vcpu_ptimer(vcpu)->irq.irq != ptimer_irq) + return false; + } + + return true; +} + +bool kvm_arch_timer_get_input_level(int vintid) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + struct arch_timer_context *timer; + + if (vintid == vcpu_vtimer(vcpu)->irq.irq) + timer = vcpu_vtimer(vcpu); + else if (vintid == vcpu_ptimer(vcpu)->irq.irq) + timer = vcpu_ptimer(vcpu); + else + BUG(); + + return kvm_timer_should_fire(timer); +} + +int kvm_timer_enable(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + int ret; + + if (timer->enabled) + return 0; + + /* Without a VGIC we do not map virtual IRQs to physical IRQs */ + if (!irqchip_in_kernel(vcpu->kvm)) + goto no_vgic; + + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + + if (!timer_irqs_are_valid(vcpu)) { + kvm_debug("incorrectly configured timer irqs\n"); + return -EINVAL; + } + + get_timer_map(vcpu, &map); + + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_vtimer->host_timer_irq, + map.direct_vtimer->irq.irq, + kvm_arch_timer_get_input_level); + if (ret) + return ret; + + if (map.direct_ptimer) { + ret = kvm_vgic_map_phys_irq(vcpu, 
+ map.direct_ptimer->host_timer_irq, + map.direct_ptimer->irq.irq, + kvm_arch_timer_get_input_level); + } + + if (ret) + return ret; + +no_vgic: + timer->enabled = 1; + return 0; +} + +/* + * On VHE system, we only need to configure the EL2 timer trap register once, + * not for every world switch. + * The host kernel runs at EL2 with HCR_EL2.TGE == 1, + * and this makes those bits have no effect for the host kernel execution. + */ +void kvm_timer_init_vhe(void) +{ + /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */ + u32 cnthctl_shift = 10; + u64 val; + + /* + * VHE systems allow the guest direct access to the EL1 physical + * timer/counter. + */ + val = read_sysreg(cnthctl_el2); + val |= (CNTHCTL_EL1PCEN << cnthctl_shift); + val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); + write_sysreg(val, cnthctl_el2); +} + +static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu_vtimer(vcpu)->irq.irq = vtimer_irq; + vcpu_ptimer(vcpu)->irq.irq = ptimer_irq; + } +} + +int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + int __user *uaddr = (int __user *)(long)attr->addr; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (get_user(irq, uaddr)) + return -EFAULT; + + if (!(irq_is_ppi(irq))) + return -EINVAL; + + if (vcpu->arch.timer_cpu.enabled) + return -EBUSY; + + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq); + break; + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq); + break; + default: + return -ENXIO; + } + + return 0; +} + +int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + int __user *uaddr = (int __user *)(long)attr->addr; + struct arch_timer_context *timer; + int irq; + + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + timer = vcpu_vtimer(vcpu); + break; + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + timer = vcpu_ptimer(vcpu); + break; + default: + return -ENXIO; + } + + irq = timer->irq.irq; + return put_user(irq, uaddr); +} + +int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + return 0; + } + + return -ENXIO; +} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c new file mode 100644 index 000000000000..7a57381c05e8 --- /dev/null +++ b/arch/arm64/kvm/arm.c @@ -0,0 +1,1710 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/bug.h> +#include <linux/cpu_pm.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/kvm.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> +#include <linux/sched/stat.h> +#include <trace/events/kvm.h> + +#define CREATE_TRACE_POINTS +#include "trace_arm.h" + +#include <linux/uaccess.h> +#include <asm/ptrace.h> +#include <asm/mman.h> +#include <asm/tlbflush.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/virt.h> +#include 
<asm/kvm_arm.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_coproc.h> +#include <asm/sections.h> + +#include <kvm/arm_hypercalls.h> +#include <kvm/arm_pmu.h> +#include <kvm/arm_psci.h> + +#ifdef REQUIRES_VIRT +__asm__(".arch_extension virt"); +#endif + +DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); +static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +/* The VMID used in the VTTBR */ +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); +static u32 kvm_next_vmid; +static DEFINE_SPINLOCK(kvm_vmid_lock); + +static bool vgic_present; + +static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); +DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; +} + +int kvm_arch_hardware_setup(void *opaque) +{ + return 0; +} + +int kvm_arch_check_processor_compat(void *opaque) +{ + return 0; +} + +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_ARM_NISV_TO_USER: + r = 0; + kvm->arch.return_nisv_io_abort_to_user = true; + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static int kvm_arm_default_max_vcpus(void) +{ + return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; +} + +/** + * kvm_arch_init_vm - initializes a VM data structure + * @kvm: pointer to the KVM struct + */ +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + int ret, cpu; + + ret = kvm_arm_setup_stage2(kvm, type); + if (ret) + return ret; + + kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); + if (!kvm->arch.last_vcpu_ran) + return -ENOMEM; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1; + + ret = kvm_alloc_stage2_pgd(kvm); + if (ret) + goto out_fail_alloc; + + ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); + if (ret) + goto out_free_stage2_pgd; + + kvm_vgic_early_init(kvm); + + /* Mark the initial VMID generation invalid */ + kvm->arch.vmid.vmid_gen = 0; + + /* The maximum number of VCPUs is limited by the host's GIC model */ + kvm->arch.max_vcpus = kvm_arm_default_max_vcpus(); + + return ret; +out_free_stage2_pgd: + kvm_free_stage2_pgd(kvm); +out_fail_alloc: + free_percpu(kvm->arch.last_vcpu_ran); + kvm->arch.last_vcpu_ran = NULL; + return ret; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + + +/** + * kvm_arch_destroy_vm - destroy the VM data structure + * @kvm: pointer to the KVM struct + */ +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + int i; + + kvm_vgic_destroy(kvm); + + free_percpu(kvm->arch.last_vcpu_ran); + kvm->arch.last_vcpu_ran = NULL; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_vcpu_destroy(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } + atomic_set(&kvm->online_vcpus, 0); +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + switch (ext) { + case KVM_CAP_IRQCHIP: + r = vgic_present; + break; + case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: + case KVM_CAP_USER_MEMORY: + case KVM_CAP_SYNC_MMU: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_ONE_REG: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_VCPU_EVENTS: 
+ case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: + case KVM_CAP_ARM_NISV_TO_USER: + case KVM_CAP_ARM_INJECT_EXT_DABT: + r = 1; + break; + case KVM_CAP_ARM_SET_DEVICE_ADDR: + r = 1; + break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + if (kvm) + r = kvm->arch.max_vcpus; + else + r = kvm_arm_default_max_vcpus(); + break; + case KVM_CAP_MSI_DEVID: + if (!kvm) + r = -EINVAL; + else + r = kvm->arch.vgic.msis_require_devid; + break; + case KVM_CAP_ARM_USER_IRQ: + /* + * 1: EL1_VTIMER, EL1_PTIMER, and PMU. + * (bump this number if adding more devices) + */ + r = 1; + break; + default: + r = kvm_arch_vm_ioctl_check_extension(kvm, ext); + break; + } + return r; +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +struct kvm *kvm_arch_alloc_vm(void) +{ + if (!has_vhe()) + return kzalloc(sizeof(struct kvm), GFP_KERNEL); + + return vzalloc(sizeof(struct kvm)); +} + +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (!has_vhe()) + kfree(kvm); + else + vfree(kvm); +} + +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) + return -EBUSY; + + if (id >= kvm->arch.max_vcpus) + return -EINVAL; + + return 0; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + int err; + + /* Force users to call KVM_ARM_VCPU_INIT */ + vcpu->arch.target = -1; + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + + /* Set up the timer */ + kvm_timer_vcpu_init(vcpu); + + kvm_pmu_vcpu_init(vcpu); + + kvm_arm_reset_debug_ptr(vcpu); + + kvm_arm_pvtime_vcpu_init(&vcpu->arch); + + err = kvm_vgic_vcpu_init(vcpu); + if (err) + return err; + + return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) + static_branch_dec(&userspace_irqchip_in_use); + + kvm_mmu_free_memory_caches(vcpu); + kvm_timer_vcpu_terminate(vcpu); + kvm_pmu_vcpu_destroy(vcpu); + + kvm_arm_vcpu_destroy(vcpu); +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return kvm_timer_is_pending(vcpu); +} + +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + /* + * If we're about to block (most likely because we've just hit a + * WFI), we need to sync back the state of the GIC CPU interface + * so that we have the latest PMR and group enables. This ensures + * that kvm_arch_vcpu_runnable has up-to-date data to decide + * whether we have pending interrupts. + * + * For the same reason, we want to tell GICv4 that we need + * doorbells to be signalled, should an interrupt become pending. + */ + preempt_disable(); + kvm_vgic_vmcr_sync(vcpu); + vgic_v4_put(vcpu, true); + preempt_enable(); +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + vgic_v4_load(vcpu); + preempt_enable(); +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + int *last_ran; + kvm_host_data_t *cpu_data; + + last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran); + cpu_data = this_cpu_ptr(&kvm_host_data); + + /* + * We might get preempted before the vCPU actually runs, but + * over-invalidation doesn't affect correctness. 
+ */ + if (*last_ran != vcpu->vcpu_id) { + kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu); + *last_ran = vcpu->vcpu_id; + } + + vcpu->cpu = cpu; + vcpu->arch.host_cpu_context = &cpu_data->host_ctxt; + + kvm_vgic_load(vcpu); + kvm_timer_vcpu_load(vcpu); + kvm_vcpu_load_sysregs(vcpu); + kvm_arch_vcpu_load_fp(vcpu); + kvm_vcpu_pmu_restore_guest(vcpu); + if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) + kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); + + if (single_task_running()) + vcpu_clear_wfx_traps(vcpu); + else + vcpu_set_wfx_traps(vcpu); + + vcpu_ptrauth_setup_lazy(vcpu); +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_put_fp(vcpu); + kvm_vcpu_put_sysregs(vcpu); + kvm_timer_vcpu_put(vcpu); + kvm_vgic_put(vcpu); + kvm_vcpu_pmu_restore_host(vcpu); + + vcpu->cpu = -1; +} + +static void vcpu_power_off(struct kvm_vcpu *vcpu) +{ + vcpu->arch.power_off = true; + kvm_make_request(KVM_REQ_SLEEP, vcpu); + kvm_vcpu_kick(vcpu); +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + if (vcpu->arch.power_off) + mp_state->mp_state = KVM_MP_STATE_STOPPED; + else + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int ret = 0; + + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.power_off = false; + break; + case KVM_MP_STATE_STOPPED: + vcpu_power_off(vcpu); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled + * @v: The VCPU pointer + * + * If the guest CPU is not waiting for interrupts or an interrupt line is + * asserted, the CPU is by definition runnable. + */ +int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) +{ + bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); + return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) + && !v->arch.power_off && !v->arch.pause); +} + +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu_mode_priv(vcpu); +} + +/* Just ensure a guest exit from a particular CPU */ +static void exit_vm_noop(void *info) +{ +} + +void force_vm_exit(const cpumask_t *mask) +{ + preempt_disable(); + smp_call_function_many(mask, exit_vm_noop, NULL, true); + preempt_enable(); +} + +/** + * need_new_vmid_gen - check that the VMID is still valid + * @vmid: The VMID to check + * + * return true if there is a new generation of VMIDs being used + * + * The hardware supports a limited set of values with the value zero reserved + * for the host, so we check if an assigned value belongs to a previous + * generation, which requires us to assign a new value. If we're the first to + * use a VMID for the new generation, we must flush necessary caches and TLBs + * on all CPUs. + */ +static bool need_new_vmid_gen(struct kvm_vmid *vmid) +{ + u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); + smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ + return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen); +} + +/** + * update_vmid - Update the vmid with a valid VMID for the current generation + * @kvm: The guest that struct vmid belongs to + * @vmid: The stage-2 VMID information struct + */ +static void update_vmid(struct kvm_vmid *vmid) +{ + if (!need_new_vmid_gen(vmid)) + return; + + spin_lock(&kvm_vmid_lock); + + /* + * We need to re-check the vmid_gen here to ensure that if another vcpu + * already allocated a valid vmid for this vm, then this vcpu should + * use the same vmid. 
+ */ + if (!need_new_vmid_gen(vmid)) { + spin_unlock(&kvm_vmid_lock); + return; + } + + /* First user of a new VMID generation? */ + if (unlikely(kvm_next_vmid == 0)) { + atomic64_inc(&kvm_vmid_gen); + kvm_next_vmid = 1; + + /* + * On SMP we know no other CPUs can use this CPU's or each + * other's VMID after force_vm_exit returns since the + * kvm_vmid_lock blocks them from reentry to the guest. + */ + force_vm_exit(cpu_all_mask); + /* + * Now broadcast TLB + ICACHE invalidation over the inner + * shareable domain to make sure all data structures are + * clean. + */ + kvm_call_hyp(__kvm_flush_vm_context); + } + + vmid->vmid = kvm_next_vmid; + kvm_next_vmid++; + kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; + + smp_wmb(); + WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen)); + + spin_unlock(&kvm_vmid_lock); +} + +static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret = 0; + + if (likely(vcpu->arch.has_run_once)) + return 0; + + if (!kvm_arm_vcpu_is_finalized(vcpu)) + return -EPERM; + + vcpu->arch.has_run_once = true; + + if (likely(irqchip_in_kernel(kvm))) { + /* + * Map the VGIC hardware resources before running a vcpu the + * first time on this VM. + */ + if (unlikely(!vgic_ready(kvm))) { + ret = kvm_vgic_map_resources(kvm); + if (ret) + return ret; + } + } else { + /* + * Tell the rest of the code that there are userspace irqchip + * VMs in the wild. + */ + static_branch_inc(&userspace_irqchip_in_use); + } + + ret = kvm_timer_enable(vcpu); + if (ret) + return ret; + + ret = kvm_arm_pmu_v3_enable(vcpu); + + return ret; +} + +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ + return vgic_initialized(kvm); +} + +void kvm_arm_halt_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.pause = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); +} + +void kvm_arm_resume_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.pause = false; + rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu)); + } +} + +static void vcpu_req_sleep(struct kvm_vcpu *vcpu) +{ + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); + + rcuwait_wait_event(wait, + (!vcpu->arch.power_off) &&(!vcpu->arch.pause), + TASK_INTERRUPTIBLE); + + if (vcpu->arch.power_off || vcpu->arch.pause) { + /* Awaken to handle a signal, request we sleep again later. */ + kvm_make_request(KVM_REQ_SLEEP, vcpu); + } + + /* + * Make sure we will observe a potential reset request if we've + * observed a change to the power state. Pairs with the smp_wmb() in + * kvm_psci_vcpu_on(). + */ + smp_rmb(); +} + +static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.target >= 0; +} + +static void check_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) + vcpu_req_sleep(vcpu); + + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + + /* + * Clear IRQ_PENDING requests that were made to guarantee + * that a VCPU sees new virtual interrupts. 
+ */ + kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) + kvm_update_stolen_time(vcpu); + + if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { + /* The distributor enable bits were changed */ + preempt_disable(); + vgic_v4_put(vcpu, false); + vgic_v4_load(vcpu); + preempt_enable(); + } + } +} + +/** + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code + * @vcpu: The VCPU pointer + * + * This function is called through the VCPU_RUN ioctl called from user space. It + * will execute VM code in a loop until the time slice for the process is used + * or some emulation is needed from user space in which case the function will + * return with return value 0 and with the kvm_run structure filled in with the + * required data for the requested emulation. + */ +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int ret; + + if (unlikely(!kvm_vcpu_initialized(vcpu))) + return -ENOEXEC; + + ret = kvm_vcpu_first_run_init(vcpu); + if (ret) + return ret; + + if (run->exit_reason == KVM_EXIT_MMIO) { + ret = kvm_handle_mmio_return(vcpu, run); + if (ret) + return ret; + } + + if (run->immediate_exit) + return -EINTR; + + vcpu_load(vcpu); + + kvm_sigset_activate(vcpu); + + ret = 1; + run->exit_reason = KVM_EXIT_UNKNOWN; + while (ret > 0) { + /* + * Check conditions before entering the guest + */ + cond_resched(); + + update_vmid(&vcpu->kvm->arch.vmid); + + check_vcpu_requests(vcpu); + + /* + * Preparing the interrupts to be injected also + * involves poking the GIC, which must be done in a + * non-preemptible context. + */ + preempt_disable(); + + kvm_pmu_flush_hwstate(vcpu); + + local_irq_disable(); + + kvm_vgic_flush_hwstate(vcpu); + + /* + * Exit if we have a signal pending so that we can deliver the + * signal to user space. + */ + if (signal_pending(current)) { + ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + } + + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + } + } + + /* + * Ensure we set mode to IN_GUEST_MODE after we disable + * interrupts and before the final VCPU requests check. 
+ * See the comment in kvm_vcpu_exiting_guest_mode() and + * Documentation/virt/kvm/vcpu-requests.rst + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + + if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) || + kvm_request_pending(vcpu)) { + vcpu->mode = OUTSIDE_GUEST_MODE; + isb(); /* Ensure work in x_flush_hwstate is committed */ + kvm_pmu_sync_hwstate(vcpu); + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); + kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); + preempt_enable(); + continue; + } + + kvm_arm_setup_debug(vcpu); + + /************************************************************** + * Enter the guest + */ + trace_kvm_entry(*vcpu_pc(vcpu)); + guest_enter_irqoff(); + + if (has_vhe()) { + ret = kvm_vcpu_run_vhe(vcpu); + } else { + ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu); + } + + vcpu->mode = OUTSIDE_GUEST_MODE; + vcpu->stat.exits++; + /* + * Back from guest + *************************************************************/ + + kvm_arm_clear_debug(vcpu); + + /* + * We must sync the PMU state before the vgic state so + * that the vgic can properly sample the updated state of the + * interrupt line. + */ + kvm_pmu_sync_hwstate(vcpu); + + /* + * Sync the vgic state before syncing the timer state because + * the timer code needs to know if the virtual timer + * interrupts are active. + */ + kvm_vgic_sync_hwstate(vcpu); + + /* + * Sync the timer hardware state before enabling interrupts as + * we don't want vtimer interrupts to race with syncing the + * timer virtual interrupt state. + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); + + kvm_arch_vcpu_ctxsync_fp(vcpu); + + /* + * We may have taken a host interrupt in HYP mode (ie + * while executing the guest). This interrupt is still + * pending, as we haven't serviced it yet! + * + * We're now back in SVC mode, with interrupts + * disabled. Enabling the interrupts now will have + * the effect of taking the interrupt again, in SVC + * mode this time. + */ + local_irq_enable(); + + /* + * We do local_irq_enable() before calling guest_exit() so + * that if a timer interrupt hits while running the guest we + * account that tick as being spent in the guest. We enable + * preemption after calling guest_exit() so that if we get + * preempted we make sure ticks after that is not counted as + * guest time. 
+ */ + guest_exit(); + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, run, ret); + + preempt_enable(); + + ret = handle_exit(vcpu, run, ret); + } + + /* Tell userspace about in-kernel device output levels */ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { + kvm_timer_update_run(vcpu); + kvm_pmu_update_run(vcpu); + } + + kvm_sigset_deactivate(vcpu); + + vcpu_put(vcpu); + return ret; +} + +static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) +{ + int bit_index; + bool set; + unsigned long *hcr; + + if (number == KVM_ARM_IRQ_CPU_IRQ) + bit_index = __ffs(HCR_VI); + else /* KVM_ARM_IRQ_CPU_FIQ */ + bit_index = __ffs(HCR_VF); + + hcr = vcpu_hcr(vcpu); + if (level) + set = test_and_set_bit(bit_index, hcr); + else + set = test_and_clear_bit(bit_index, hcr); + + /* + * If we didn't change anything, no need to wake up or kick other CPUs + */ + if (set == level) + return 0; + + /* + * The vcpu irq_lines field was updated, wake up sleeping VCPUs and + * trigger a world-switch round on the running physical CPU to set the + * virtual IRQ/FIQ fields in the HCR appropriately. + */ + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return 0; +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, + bool line_status) +{ + u32 irq = irq_level->irq; + unsigned int irq_type, vcpu_idx, irq_num; + int nrcpus = atomic_read(&kvm->online_vcpus); + struct kvm_vcpu *vcpu = NULL; + bool level = irq_level->level; + + irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; + vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; + vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); + irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; + + trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); + + switch (irq_type) { + case KVM_ARM_IRQ_TYPE_CPU: + if (irqchip_in_kernel(kvm)) + return -ENXIO; + + if (vcpu_idx >= nrcpus) + return -EINVAL; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + + if (irq_num > KVM_ARM_IRQ_CPU_FIQ) + return -EINVAL; + + return vcpu_interrupt_line(vcpu, irq_num, level); + case KVM_ARM_IRQ_TYPE_PPI: + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + if (vcpu_idx >= nrcpus) + return -EINVAL; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + + if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); + case KVM_ARM_IRQ_TYPE_SPI: + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + if (irq_num < VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); + } + + return -EINVAL; +} + +static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned int i, ret; + int phys_target = kvm_target_cpu(); + + if (init->target != phys_target) + return -EINVAL; + + /* + * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must + * use the same target. + */ + if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) + return -EINVAL; + + /* -ENOENT for unknown features, -EINVAL for invalid combinations. 
*/ + for (i = 0; i < sizeof(init->features) * 8; i++) { + bool set = (init->features[i / 32] & (1 << (i % 32))); + + if (set && i >= KVM_VCPU_MAX_FEATURES) + return -ENOENT; + + /* + * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must + * use the same feature set. + */ + if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && + test_bit(i, vcpu->arch.features) != set) + return -EINVAL; + + if (set) + set_bit(i, vcpu->arch.features); + } + + vcpu->arch.target = phys_target; + + /* Now we know what it is, we can reset it. */ + ret = kvm_reset_vcpu(vcpu); + if (ret) { + vcpu->arch.target = -1; + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + } + + return ret; +} + +static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, + struct kvm_vcpu_init *init) +{ + int ret; + + ret = kvm_vcpu_set_target(vcpu, init); + if (ret) + return ret; + + /* + * Ensure a rebooted VM will fault in RAM pages and detect if the + * guest MMU is turned off and flush the caches as needed. + * + * S2FWB enforces all memory accesses to RAM being cacheable, we + * ensure that the cache is always coherent. + */ + if (vcpu->arch.has_run_once && !cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + stage2_unmap_vm(vcpu->kvm); + + vcpu_reset_hcr(vcpu); + + /* + * Handle the "start in power-off" case. + */ + if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) + vcpu_power_off(vcpu); + else + vcpu->arch.power_off = false; + + return 0; +} + +static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->group) { + default: + ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr); + break; + } + + return ret; +} + +static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + memset(events, 0, sizeof(*events)); + + return __kvm_arm_vcpu_get_events(vcpu, events); +} + +static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + int i; + + /* check whether the reserved field is zero */ + for (i = 0; i < ARRAY_SIZE(events->reserved); i++) + if (events->reserved[i]) + return -EINVAL; + + /* check whether the pad field is zero */ + for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) + if (events->exception.pad[i]) + return -EINVAL; + + return __kvm_arm_vcpu_set_events(vcpu, events); +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + struct kvm_device_attr attr; + long r; + + switch (ioctl) { + case KVM_ARM_VCPU_INIT: { + struct kvm_vcpu_init init; + + r = -EFAULT; + if (copy_from_user(&init, argp, sizeof(init))) + break; + + r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + break; + } + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct kvm_one_reg reg; + + r = -ENOEXEC; + if (unlikely(!kvm_vcpu_initialized(vcpu))) + break; + + r = -EFAULT; + if (copy_from_user(&reg, argp, sizeof(reg))) + break; + + if (ioctl == KVM_SET_ONE_REG) + r = kvm_arm_set_reg(vcpu, &reg); + else + r =
kvm_arm_get_reg(vcpu, &reg); + break; + } + case KVM_GET_REG_LIST: { + struct kvm_reg_list __user *user_list = argp; + struct kvm_reg_list reg_list; + unsigned n; + + r = -ENOEXEC; + if (unlikely(!kvm_vcpu_initialized(vcpu))) + break; + + r = -EPERM; + if (!kvm_arm_vcpu_is_finalized(vcpu)) + break; + + r = -EFAULT; + if (copy_from_user(&reg_list, user_list, sizeof(reg_list))) + break; + n = reg_list.n; + reg_list.n = kvm_arm_num_regs(vcpu); + if (copy_to_user(user_list, &reg_list, sizeof(reg_list))) + break; + r = -E2BIG; + if (n < reg_list.n) + break; + r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; + } + case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_set_attr(vcpu, &attr); + break; + } + case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_get_attr(vcpu, &attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, argp, sizeof(attr))) + break; + r = kvm_arm_vcpu_has_attr(vcpu, &attr); + break; + } + case KVM_GET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (kvm_arm_vcpu_get_events(vcpu, &events)) + return -EINVAL; + + if (copy_to_user(argp, &events, sizeof(events))) + return -EFAULT; + + return 0; + } + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (copy_from_user(&events, argp, sizeof(events))) + return -EFAULT; + + return kvm_arm_vcpu_set_events(vcpu, &events); + } + case KVM_ARM_VCPU_FINALIZE: { + int what; + + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + + if (get_user(what, (const int __user *)argp)) + return -EFAULT; + + return kvm_arm_vcpu_finalize(vcpu, what); + } + default: + r = -EINVAL; + } + + return r; +} + +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + +} + +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + kvm_flush_remote_tlbs(kvm); +} + +static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, + struct kvm_arm_device_addr *dev_addr) +{ + unsigned long dev_id, type; + + dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >> + KVM_ARM_DEVICE_ID_SHIFT; + type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >> + KVM_ARM_DEVICE_TYPE_SHIFT; + + switch (dev_id) { + case KVM_ARM_DEVICE_VGIC_V2: + if (!vgic_present) + return -ENXIO; + return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); + default: + return -ENODEV; + } +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + + switch (ioctl) { + case KVM_CREATE_IRQCHIP: { + int ret; + if (!vgic_present) + return -ENXIO; + mutex_lock(&kvm->lock); + ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + mutex_unlock(&kvm->lock); + return ret; + } + case KVM_ARM_SET_DEVICE_ADDR: { + struct kvm_arm_device_addr dev_addr; + + if (copy_from_user(&dev_addr, argp, sizeof(dev_addr))) + return -EFAULT; + return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); + } + case KVM_ARM_PREFERRED_TARGET: { + int err; + struct kvm_vcpu_init init; + + err = kvm_vcpu_preferred_target(&init); + if (err) + return err; + + if (copy_to_user(argp, &init, sizeof(init))) + return -EFAULT; + + return 0; + } + default: + return -EINVAL; + } +} + +static void cpu_init_hyp_mode(void) +{ + phys_addr_t pgd_ptr; + unsigned long hyp_stack_ptr; + unsigned long vector_ptr; + unsigned long tpidr_el2; + + /* Switch from the HYP stub to our own HYP init vector */
+ __hyp_set_vectors(kvm_get_idmap_vector()); + + /* + * Calculate the raw per-cpu offset without a translation from the + * kernel's mapping to the linear mapping, and store it in tpidr_el2 + * so that we can use adr_l to access per-cpu variables in EL2. + */ + tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) - + (unsigned long)kvm_ksym_ref(kvm_host_data)); + + pgd_ptr = kvm_mmu_get_httbr(); + hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; + vector_ptr = (unsigned long)kvm_get_hyp_vector(); + + /* + * Call initialization code, and switch to the full blown HYP code. + * If the cpucaps haven't been finalized yet, something has gone very + * wrong, and hyp will crash and burn when it uses any + * cpus_have_const_cap() wrapper. + */ + BUG_ON(!system_capabilities_finalized()); + __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); + + /* + * Disabling SSBD on a non-VHE system requires us to enable SSBS + * at EL2. + */ + if (this_cpu_has_cap(ARM64_SSBS) && + arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { + kvm_call_hyp(__kvm_enable_ssbs); + } +} + +static void cpu_hyp_reset(void) +{ + if (!is_kernel_in_hyp_mode()) + __hyp_reset_vectors(); +} + +static void cpu_hyp_reinit(void) +{ + kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt); + + cpu_hyp_reset(); + + if (is_kernel_in_hyp_mode()) + kvm_timer_init_vhe(); + else + cpu_init_hyp_mode(); + + kvm_arm_init_debug(); + + if (vgic_present) + kvm_vgic_init_cpu_hardware(); +} + +static void _kvm_arch_hardware_enable(void *discard) +{ + if (!__this_cpu_read(kvm_arm_hardware_enabled)) { + cpu_hyp_reinit(); + __this_cpu_write(kvm_arm_hardware_enabled, 1); + } +} + +int kvm_arch_hardware_enable(void) +{ + _kvm_arch_hardware_enable(NULL); + return 0; +} + +static void _kvm_arch_hardware_disable(void *discard) +{ + if (__this_cpu_read(kvm_arm_hardware_enabled)) { + cpu_hyp_reset(); + __this_cpu_write(kvm_arm_hardware_enabled, 0); + } +} + +void kvm_arch_hardware_disable(void) +{ + _kvm_arch_hardware_disable(NULL); +} + +#ifdef CONFIG_CPU_PM +static int hyp_init_cpu_pm_notifier(struct notifier_block *self, + unsigned long cmd, + void *v) +{ + /* + * kvm_arm_hardware_enabled is left with its old value over + * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should + * re-enable hyp. + */ + switch (cmd) { + case CPU_PM_ENTER: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* + * don't update kvm_arm_hardware_enabled here + * so that the hardware will be re-enabled + * when we resume. See below. + */ + cpu_hyp_reset(); + + return NOTIFY_OK; + case CPU_PM_ENTER_FAILED: + case CPU_PM_EXIT: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* The hardware was enabled before suspend. */ + cpu_hyp_reinit(); + + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block hyp_init_cpu_pm_nb = { + .notifier_call = hyp_init_cpu_pm_notifier, +}; + +static void __init hyp_cpu_pm_init(void) +{ + cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); +} +static void __init hyp_cpu_pm_exit(void) +{ + cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); +} +#else +static inline void hyp_cpu_pm_init(void) +{ +} +static inline void hyp_cpu_pm_exit(void) +{ +} +#endif + +static int init_common_resources(void) +{ + return kvm_set_ipa_limit(); +} + +static int init_subsystems(void) +{ + int err = 0; + + /* + * Enable hardware so that subsystem initialisation can access EL2. 
+ */ + on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); + + /* + * Register CPU lower-power notifier + */ + hyp_cpu_pm_init(); + + /* + * Init HYP view of VGIC + */ + err = kvm_vgic_hyp_init(); + switch (err) { + case 0: + vgic_present = true; + break; + case -ENODEV: + case -ENXIO: + vgic_present = false; + err = 0; + break; + default: + goto out; + } + + /* + * Init HYP architected timer support + */ + err = kvm_timer_hyp_init(vgic_present); + if (err) + goto out; + + kvm_perf_init(); + kvm_coproc_table_init(); + +out: + on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); + + return err; +} + +static void teardown_hyp_mode(void) +{ + int cpu; + + free_hyp_pgds(); + for_each_possible_cpu(cpu) + free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); +} + +/** + * Inits Hyp-mode on all online CPUs + */ +static int init_hyp_mode(void) +{ + int cpu; + int err = 0; + + /* + * Allocate Hyp PGD and setup Hyp identity mapping + */ + err = kvm_mmu_init(); + if (err) + goto out_err; + + /* + * Allocate stack pages for Hypervisor-mode + */ + for_each_possible_cpu(cpu) { + unsigned long stack_page; + + stack_page = __get_free_page(GFP_KERNEL); + if (!stack_page) { + err = -ENOMEM; + goto out_err; + } + + per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; + } + + /* + * Map the Hyp-code called directly from the host + */ + err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), + kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); + if (err) { + kvm_err("Cannot map world-switch code\n"); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), + kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map rodata section\n"); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__bss_start), + kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map bss section\n"); + goto out_err; + } + + err = kvm_map_vectors(); + if (err) { + kvm_err("Cannot map vectors\n"); + goto out_err; + } + + /* + * Map the Hyp stack pages + */ + for_each_possible_cpu(cpu) { + char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); + err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, + PAGE_HYP); + + if (err) { + kvm_err("Cannot map hyp stack\n"); + goto out_err; + } + } + + for_each_possible_cpu(cpu) { + kvm_host_data_t *cpu_data; + + cpu_data = per_cpu_ptr(&kvm_host_data, cpu); + err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); + + if (err) { + kvm_err("Cannot map host CPU state: %d\n", err); + goto out_err; + } + } + + err = hyp_map_aux_data(); + if (err) + kvm_err("Cannot map host auxiliary data: %d\n", err); + + return 0; + +out_err: + teardown_hyp_mode(); + kvm_err("error initializing Hyp mode: %d\n", err); + return err; +} + +static void check_kvm_target_cpu(void *ret) +{ + *(int *)ret = kvm_target_cpu(); +} + +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) +{ + struct kvm_vcpu *vcpu; + int i; + + mpidr &= MPIDR_HWID_BITMASK; + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) + return vcpu; + } + return NULL; +} + +bool kvm_arch_has_irq_bypass(void) +{ + return true; +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, + &irqfd->irq_entry); +} +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct 
kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, + &irqfd->irq_entry); +} + +void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_halt_guest(irqfd->kvm); +} + +void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_resume_guest(irqfd->kvm); +} + +/** + * Initialize Hyp-mode and memory mappings on all CPUs. + */ +int kvm_arch_init(void *opaque) +{ + int err; + int ret, cpu; + bool in_hyp_mode; + + if (!is_hyp_mode_available()) { + kvm_info("HYP mode not available\n"); + return -ENODEV; + } + + in_hyp_mode = is_kernel_in_hyp_mode(); + + if (!in_hyp_mode && kvm_arch_requires_vhe()) { + kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n"); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); + if (ret < 0) { + kvm_err("Error, CPU %d not supported!\n", cpu); + return -ENODEV; + } + } + + err = init_common_resources(); + if (err) + return err; + + err = kvm_arm_init_sve(); + if (err) + return err; + + if (!in_hyp_mode) { + err = init_hyp_mode(); + if (err) + goto out_err; + } + + err = init_subsystems(); + if (err) + goto out_hyp; + + if (in_hyp_mode) + kvm_info("VHE mode initialized successfully\n"); + else + kvm_info("Hyp mode initialized successfully\n"); + + return 0; + +out_hyp: + hyp_cpu_pm_exit(); + if (!in_hyp_mode) + teardown_hyp_mode(); +out_err: + return err; +} + +/* NOP: Compiling as a module not supported */ +void kvm_arch_exit(void) +{ + kvm_perf_teardown(); +} + +static int arm_init(void) +{ + int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + return rc; +} + +module_init(arm_init); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 23ebe51410f0..aea43ec60f37 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -29,20 +29,19 @@ #include "trace.h" -#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM } -#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU } - struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT(halt_successful_poll), - VCPU_STAT(halt_attempted_poll), - VCPU_STAT(halt_poll_invalid), - VCPU_STAT(halt_wakeup), - VCPU_STAT(hvc_exit_stat), - VCPU_STAT(wfe_exit_stat), - VCPU_STAT(wfi_exit_stat), - VCPU_STAT(mmio_exit_user), - VCPU_STAT(mmio_exit_kernel), - VCPU_STAT(exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hvc_exit_stat", hvc_exit_stat), + VCPU_STAT("wfe_exit_stat", wfe_exit_stat), + VCPU_STAT("wfi_exit_stat", wfi_exit_stat), + VCPU_STAT("mmio_exit_user", mmio_exit_user), + VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel), + VCPU_STAT("exits", exits), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), { NULL } }; @@ -200,6 +199,13 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } memcpy((u32 *)regs + off, valp, KVM_REG_SIZE(reg->id)); + + if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { + int i; + + for (i = 0; i < 16; i++) + *vcpu_reg32(vcpu, i) = (u32)*vcpu_reg32(vcpu, i); + } out: return err; } @@ -260,7 
+266,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) /* * Vector lengths supported by the host can't currently be * hidden from the guest individually: instead we can only set a - * maxmium via ZCR_EL2.LEN. So, make sure the available vector + * maximum via ZCR_EL2.LEN. So, make sure the available vector * lengths match the set requested exactly up to the requested * maximum: */ @@ -330,7 +336,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region, unsigned int reg_num; unsigned int reqoffset, reqlen; /* User-requested offset and length */ - unsigned int maxlen; /* Maxmimum permitted length */ + unsigned int maxlen; /* Maximum permitted length */ size_t sve_state_size; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index aacfc55de44c..eb194696ef62 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -23,7 +23,7 @@ #include <kvm/arm_hypercalls.h> #define CREATE_TRACE_POINTS -#include "trace.h" +#include "trace_handle_exit.h" typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index ea710f674cb6..8c9880783839 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -6,20 +6,10 @@ ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -KVM=../../../../virt/kvm +obj-$(CONFIG_KVM) += hyp.o -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/aarch32.o - -obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o -obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += entry.o -obj-$(CONFIG_KVM_ARM_HOST) += switch.o -obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o -obj-$(CONFIG_KVM_ARM_HOST) += tlb.o -obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o +hyp-y := vgic-v3-sr.o timer-sr.o aarch32.o vgic-v2-cpuif-proxy.o sysreg-sr.o \ + debug-sr.o entry.o switch.o fpsimd.o tlb.o hyp-entry.o # KVM code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c new file mode 100644 index 000000000000..25c0e47d57cb --- /dev/null +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyp portion of the (not much of an) Emulation layer for 32bit guests. + * + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * based on arch/arm/kvm/emulate.c + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> + +/* + * stolen from arch/arm/kernel/opcodes.c + * + * condition code lookup table + * index into the table is test code: EQ, NE, ... 
LT, GT, AL, NV + * + * bit position in short is condition code: NZCV + */ +static const unsigned short cc_map[16] = { + 0xF0F0, /* EQ == Z set */ + 0x0F0F, /* NE */ + 0xCCCC, /* CS == C set */ + 0x3333, /* CC */ + 0xFF00, /* MI == N set */ + 0x00FF, /* PL */ + 0xAAAA, /* VS == V set */ + 0x5555, /* VC */ + 0x0C0C, /* HI == C set && Z clear */ + 0xF3F3, /* LS == C clear || Z set */ + 0xAA55, /* GE == (N==V) */ + 0x55AA, /* LT == (N!=V) */ + 0x0A05, /* GT == (!Z && (N==V)) */ + 0xF5FA, /* LE == (Z || (N!=V)) */ + 0xFFFF, /* AL always */ + 0 /* NV */ +}; + +/* + * Check if a trapped instruction should have been executed or not. + */ +bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu) +{ + unsigned long cpsr; + u32 cpsr_cond; + int cond; + + /* Top two bits non-zero? Unconditional. */ + if (kvm_vcpu_get_hsr(vcpu) >> 30) + return true; + + /* Is condition field valid? */ + cond = kvm_vcpu_get_condition(vcpu); + if (cond == 0xE) + return true; + + cpsr = *vcpu_cpsr(vcpu); + + if (cond < 0) { + /* This can happen in Thumb mode: examine IT state. */ + unsigned long it; + + it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); + + /* it == 0 => unconditional. */ + if (it == 0) + return true; + + /* The cond for this insn works out as the top 4 bits. */ + cond = (it >> 4); + } + + cpsr_cond = cpsr >> 28; + + if (!((cc_map[cond] >> cpsr_cond) & 1)) + return false; + + return true; +} + +/** + * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block + * @vcpu: The VCPU pointer + * + * When exceptions occur while instructions are executed in Thumb IF-THEN + * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have + * to do this little bit of work manually. The fields map like this: + * + * IT[7:0] -> CPSR[26:25],CPSR[15:10] + */ +static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu) +{ + unsigned long itbits, cond; + unsigned long cpsr = *vcpu_cpsr(vcpu); + bool is_arm = !(cpsr & PSR_AA32_T_BIT); + + if (is_arm || !(cpsr & PSR_AA32_IT_MASK)) + return; + + cond = (cpsr & 0xe000) >> 13; + itbits = (cpsr & 0x1c00) >> (10 - 2); + itbits |= (cpsr & (0x3 << 25)) >> 25; + + /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ + if ((itbits & 0x7) == 0) + itbits = cond = 0; + else + itbits = (itbits << 1) & 0x1f; + + cpsr &= ~PSR_AA32_IT_MASK; + cpsr |= cond << 13; + cpsr |= (itbits & 0x1c) << (10 - 2); + cpsr |= (itbits & 0x3) << 25; + *vcpu_cpsr(vcpu) = cpsr; +} + +/** + * kvm_skip_instr - skip a trapped instruction and proceed to the next + * @vcpu: The vcpu pointer + */ +void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + u32 pc = *vcpu_pc(vcpu); + bool is_thumb; + + is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); + if (is_thumb && !is_wide_instr) + pc += 2; + else + pc += 4; + + *vcpu_pc(vcpu) = pc; + + kvm_adjust_itstate(vcpu); +} diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index d22d0534dd60..90186cf6473e 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -18,6 +18,7 @@ #define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x) #define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x) +#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8) .text .pushsection .hyp.text, "ax" @@ -47,6 +48,16 @@ ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] .endm +.macro save_sp_el0 ctxt, tmp + mrs \tmp, sp_el0 + str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] +.endm + +.macro restore_sp_el0 ctxt, tmp + ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] + msr sp_el0, \tmp +.endm + /* * u64 
__guest_enter(struct kvm_vcpu *vcpu, * struct kvm_cpu_context *host_ctxt); @@ -60,6 +71,9 @@ SYM_FUNC_START(__guest_enter) // Store the host regs save_callee_saved_regs x1 + // Save the host's sp_el0 + save_sp_el0 x1, x2 + // Now the host state is stored if we have a pending RAS SError it must // affect the host. If any asynchronous exception is pending we defer // the guest entry. The DSB isn't necessary before v8.2 as any SError @@ -83,6 +97,9 @@ alternative_else_nop_endif // when this feature is enabled for kernel code. ptrauth_switch_to_guest x29, x0, x1, x2 + // Restore the guest's sp_el0 + restore_sp_el0 x29, x0 + // Restore guest regs x0-x17 ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] @@ -130,6 +147,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Store the guest regs x18-x29, lr save_callee_saved_regs x1 + // Store the guest's sp_el0 + save_sp_el0 x1, x2 + get_host_ctxt x2, x3 // Macro ptrauth_switch_to_guest format: @@ -139,6 +159,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // when this feature is enabled for kernel code. ptrauth_switch_to_host x1, x2, x3, x4, x5 + // Restore the host's sp_el0 + restore_sp_el0 x2, x3 + // Now restore the host regs restore_callee_saved_regs x2 diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index c2a13ab3c471..9c5cfb04170e 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -198,7 +198,6 @@ SYM_CODE_END(__hyp_panic) .macro invalid_vector label, target = __hyp_panic .align 2 SYM_CODE_START(\label) -\label: b \target SYM_CODE_END(\label) .endm diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8a1e81a400e0..676b6585e5ae 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -138,7 +138,7 @@ static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu) write_sysreg(val, cptr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt; isb(); @@ -181,7 +181,7 @@ static void deactivate_traps_vhe(void) * above before we can switch to the EL2/EL0 translation regime used by * the host. 
*/ - asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT_VHE)); + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); write_sysreg(vectors, vbar_el1); @@ -192,7 +192,7 @@ static void __hyp_text __deactivate_traps_nvhe(void) { u64 mdcr_el2 = read_sysreg(mdcr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* @@ -270,8 +270,8 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_save_state(vcpu); - __vgic_v3_deactivate_traps(vcpu); + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); } } @@ -279,8 +279,8 @@ static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu) static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_activate_traps(vcpu); - __vgic_v3_restore_state(vcpu); + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } } diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 75b1925763f1..ea5d22fbdacf 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -15,8 +15,9 @@ /* * Non-VHE: Both host and guest must save everything. * - * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and pstate, - * which are handled as part of the el2 return state) on every switch. + * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and + * pstate, which are handled as part of the el2 return state) on every + * switch (sp_el0 is being dealt with in the assembly code). * tpidr_el0 and tpidrro_el0 only need to be switched when going * to host userspace or a different VCPU. EL1 registers only need to be * switched when potentially going to run a different VCPU. The latter two @@ -26,12 +27,6 @@ static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1); - - /* - * The host arm64 Linux uses sp_el0 to point to 'current' and it must - * therefore be saved/restored on every entry/exit to/from the guest. - */ - ctxt->gp_regs.regs.sp = read_sysreg(sp_el0); } static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) @@ -99,12 +94,6 @@ NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe); static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1); - - /* - * The host arm64 Linux uses sp_el0 to point to 'current' and it must - * therefore be saved/restored on every entry/exit to/from the guest. 
- */ - write_sysreg(ctxt->gp_regs.regs.sp, sp_el0); } static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) @@ -118,7 +107,8 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); - if (!cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (has_vhe() || + !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR); write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR); } else if (!ctxt->__hyp_running_vcpu) { @@ -149,7 +139,8 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1); write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE) && + if (!has_vhe() && + cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) && ctxt->__hyp_running_vcpu) { /* * Must only be done for host registers, hence the context diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c new file mode 100644 index 000000000000..fb5c0be33223 --- /dev/null +++ b/arch/arm64/kvm/hyp/timer-sr.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <clocksource/arm_arch_timer.h> +#include <linux/compiler.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_hyp.h> + +void __hyp_text __kvm_timer_set_cntvoff(u64 cntvoff) +{ + write_sysreg(cntvoff, cntvoff_el2); +} + +/* + * Should only be called on non-VHE systems. + * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). + */ +void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* Allow physical timer/counter access for the host */ + val = read_sysreg(cnthctl_el2); + val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; + write_sysreg(val, cnthctl_el2); +} + +/* + * Should only be called on non-VHE systems. + * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). + */ +void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* + * Disallow physical timer access for the guest + * Physical counter access is allowed + */ + val = read_sysreg(cnthctl_el2); + val &= ~CNTHCTL_EL1PCEN; + val |= CNTHCTL_EL1PCTEN; + write_sysreg(val, cnthctl_el2); +} diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index ceaddbe4279f..d063a576d511 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -23,7 +23,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm, local_irq_save(cxt->flags); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* * For CPUs that are affected by ARM errata 1165522 or 1530923, * we cannot trust stage-1 to be in a correct state at that @@ -63,7 +63,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm, static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm, struct tlb_inv_context *cxt) { - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* @@ -79,8 +79,9 @@ static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm, isb(); } + /* __load_guest_stage2() includes an ISB for the workaround. 
*/ __load_guest_stage2(kvm); - isb(); + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } static void __hyp_text __tlb_switch_to_guest(struct kvm *kvm, @@ -103,7 +104,7 @@ static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm, write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); isb(); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Restore the registers to what they were */ write_sysreg_el1(cxt->tcr, SYS_TCR); write_sysreg_el1(cxt->sctlr, SYS_SCTLR); @@ -117,7 +118,7 @@ static void __hyp_text __tlb_switch_to_host_nvhe(struct kvm *kvm, { write_sysreg(0, vttbr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Ensure write of the host VMID */ isb(); /* Restore the host's TCR_EL1 */ diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c new file mode 100644 index 000000000000..10ed539835c1 --- /dev/null +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -0,0 +1,1113 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/compiler.h> +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +#define vtr_to_max_lr_idx(v) ((v) & 0xf) +#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) +#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) + +static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) +{ + switch (lr & 0xf) { + case 0: + return read_gicreg(ICH_LR0_EL2); + case 1: + return read_gicreg(ICH_LR1_EL2); + case 2: + return read_gicreg(ICH_LR2_EL2); + case 3: + return read_gicreg(ICH_LR3_EL2); + case 4: + return read_gicreg(ICH_LR4_EL2); + case 5: + return read_gicreg(ICH_LR5_EL2); + case 6: + return read_gicreg(ICH_LR6_EL2); + case 7: + return read_gicreg(ICH_LR7_EL2); + case 8: + return read_gicreg(ICH_LR8_EL2); + case 9: + return read_gicreg(ICH_LR9_EL2); + case 10: + return read_gicreg(ICH_LR10_EL2); + case 11: + return read_gicreg(ICH_LR11_EL2); + case 12: + return read_gicreg(ICH_LR12_EL2); + case 13: + return read_gicreg(ICH_LR13_EL2); + case 14: + return read_gicreg(ICH_LR14_EL2); + case 15: + return read_gicreg(ICH_LR15_EL2); + } + + unreachable(); +} + +static void __hyp_text __gic_v3_set_lr(u64 val, int lr) +{ + switch (lr & 0xf) { + case 0: + write_gicreg(val, ICH_LR0_EL2); + break; + case 1: + write_gicreg(val, ICH_LR1_EL2); + break; + case 2: + write_gicreg(val, ICH_LR2_EL2); + break; + case 3: + write_gicreg(val, ICH_LR3_EL2); + break; + case 4: + write_gicreg(val, ICH_LR4_EL2); + break; + case 5: + write_gicreg(val, ICH_LR5_EL2); + break; + case 6: + write_gicreg(val, ICH_LR6_EL2); + break; + case 7: + write_gicreg(val, ICH_LR7_EL2); + break; + case 8: + write_gicreg(val, ICH_LR8_EL2); + break; + case 9: + write_gicreg(val, ICH_LR9_EL2); + break; + case 10: + write_gicreg(val, ICH_LR10_EL2); + break; + case 11: + write_gicreg(val, ICH_LR11_EL2); + break; + case 12: + write_gicreg(val, ICH_LR12_EL2); + break; + case 13: + write_gicreg(val, ICH_LR13_EL2); + break; + case 14: + write_gicreg(val, ICH_LR14_EL2); + break; + case 15: + write_gicreg(val, ICH_LR15_EL2); + break; + } +} + +static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n) +{ + switch (n) { + case 0: + write_gicreg(val, ICH_AP0R0_EL2); + break; + case 1: + write_gicreg(val, ICH_AP0R1_EL2); + break; 
+ case 2: + write_gicreg(val, ICH_AP0R2_EL2); + break; + case 3: + write_gicreg(val, ICH_AP0R3_EL2); + break; + } +} + +static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n) +{ + switch (n) { + case 0: + write_gicreg(val, ICH_AP1R0_EL2); + break; + case 1: + write_gicreg(val, ICH_AP1R1_EL2); + break; + case 2: + write_gicreg(val, ICH_AP1R2_EL2); + break; + case 3: + write_gicreg(val, ICH_AP1R3_EL2); + break; + } +} + +static u32 __hyp_text __vgic_v3_read_ap0rn(int n) +{ + u32 val; + + switch (n) { + case 0: + val = read_gicreg(ICH_AP0R0_EL2); + break; + case 1: + val = read_gicreg(ICH_AP0R1_EL2); + break; + case 2: + val = read_gicreg(ICH_AP0R2_EL2); + break; + case 3: + val = read_gicreg(ICH_AP0R3_EL2); + break; + default: + unreachable(); + } + + return val; +} + +static u32 __hyp_text __vgic_v3_read_ap1rn(int n) +{ + u32 val; + + switch (n) { + case 0: + val = read_gicreg(ICH_AP1R0_EL2); + break; + case 1: + val = read_gicreg(ICH_AP1R1_EL2); + break; + case 2: + val = read_gicreg(ICH_AP1R2_EL2); + break; + case 3: + val = read_gicreg(ICH_AP1R3_EL2); + break; + default: + unreachable(); + } + + return val; +} + +void __hyp_text __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) +{ + u64 used_lrs = cpu_if->used_lrs; + + /* + * Make sure stores to the GIC via the memory mapped interface + * are now visible to the system register interface when reading the + * LRs, and when reading back the VMCR on non-VHE systems. + */ + if (used_lrs || !has_vhe()) { + if (!cpu_if->vgic_sre) { + dsb(sy); + isb(); + } + } + + if (used_lrs || cpu_if->its_vpe.its_vm) { + int i; + u32 elrsr; + + elrsr = read_gicreg(ICH_ELRSR_EL2); + + write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); + + for (i = 0; i < used_lrs; i++) { + if (elrsr & (1 << i)) + cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; + else + cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); + + __gic_v3_set_lr(0, i); + } + } +} + +void __hyp_text __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) +{ + u64 used_lrs = cpu_if->used_lrs; + int i; + + if (used_lrs || cpu_if->its_vpe.its_vm) { + write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); + } + + /* + * Ensure that writes to the LRs, and on non-VHE systems ensure that + * the write to the VMCR in __vgic_v3_activate_traps(), will have + * reached the (re)distributors. This ensures the guest will read the + * correct values from the memory-mapped interface. + */ + if (used_lrs || !has_vhe()) { + if (!cpu_if->vgic_sre) { + isb(); + dsb(sy); + } + } +} + +void __hyp_text __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) +{ + /* + * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a + * Group0 interrupt (as generated in GICv2 mode) to be + * delivered as a FIQ to the guest, with potentially fatal + * consequences. So we must make sure that ICC_SRE_EL1 has + * been actually programmed with the value we want before + * starting to mess with the rest of the GIC, and VMCR_EL2 in + * particular. This logic must be called before + * __vgic_v3_restore_state(). + */ + if (!cpu_if->vgic_sre) { + write_gicreg(0, ICC_SRE_EL1); + isb(); + write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); + + + if (has_vhe()) { + /* + * Ensure that the write to the VMCR will have reached + * the (re)distributors. This ensures the guest will + * read the correct values from the memory-mapped + * interface. + */ + isb(); + dsb(sy); + } + } + + /* + * Prevent the guest from touching the GIC system registers if + * SRE isn't enabled for GICv3 emulation. 
+ */ + write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, + ICC_SRE_EL2); + + /* + * If we need to trap system registers, we must write + * ICH_HCR_EL2 anyway, even if no interrupts are being + * injected, + */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm) + write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); +} + +void __hyp_text __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) +{ + u64 val; + + if (!cpu_if->vgic_sre) { + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + } + + val = read_gicreg(ICC_SRE_EL2); + write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); + + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } + + /* + * If we were trapping system registers, we enabled the VGIC even if + * no interrupts were being injected, and we disable it again here. + */ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm) + write_gicreg(0, ICH_HCR_EL2); +} + +void __hyp_text __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + u64 val; + u32 nr_pre_bits; + + val = read_gicreg(ICH_VTR_EL2); + nr_pre_bits = vtr_to_nr_pre_bits(val); + + switch (nr_pre_bits) { + case 7: + cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); + cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); + /* Fall through */ + case 6: + cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); + /* Fall through */ + default: + cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); + } + + switch (nr_pre_bits) { + case 7: + cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); + cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); + /* Fall through */ + case 6: + cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); + /* Fall through */ + default: + cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); + } +} + +void __hyp_text __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + u64 val; + u32 nr_pre_bits; + + val = read_gicreg(ICH_VTR_EL2); + nr_pre_bits = vtr_to_nr_pre_bits(val); + + switch (nr_pre_bits) { + case 7: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); + /* Fall through */ + case 6: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); + /* Fall through */ + default: + __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); + } + + switch (nr_pre_bits) { + case 7: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); + /* Fall through */ + case 6: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); + /* Fall through */ + default: + __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); + } +} + +void __hyp_text __vgic_v3_init_lrs(void) +{ + int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); + int i; + + for (i = 0; i <= max_lr_idx; i++) + __gic_v3_set_lr(0, i); +} + +u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) +{ + return read_gicreg(ICH_VTR_EL2); +} + +u64 __hyp_text __vgic_v3_read_vmcr(void) +{ + return read_gicreg(ICH_VMCR_EL2); +} + +void __hyp_text __vgic_v3_write_vmcr(u32 vmcr) +{ + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +static int __hyp_text __vgic_v3_bpr_min(void) +{ + /* See Pseudocode for VPriorityGroup */ + return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2)); +} + +static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu) +{ + u32 esr = kvm_vcpu_get_hsr(vcpu); + u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; + + return crm != 8; +} + +#define GICv3_IDLE_PRIORITY 0xff + +static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, + u32 vmcr, + u64 
*lr_val) +{ + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + u8 priority = GICv3_IDLE_PRIORITY; + int i, lr = -1; + + for (i = 0; i < used_lrs; i++) { + u64 val = __gic_v3_get_lr(i); + u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + + /* Not pending in the state? */ + if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT) + continue; + + /* Group-0 interrupt, but Group-0 disabled? */ + if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) + continue; + + /* Group-1 interrupt, but Group-1 disabled? */ + if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) + continue; + + /* Not the highest priority? */ + if (lr_prio >= priority) + continue; + + /* This is a candidate */ + priority = lr_prio; + *lr_val = val; + lr = i; + } + + if (lr == -1) + *lr_val = ICC_IAR1_EL1_SPURIOUS; + + return lr; +} + +static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, + int intid, u64 *lr_val) +{ + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + int i; + + for (i = 0; i < used_lrs; i++) { + u64 val = __gic_v3_get_lr(i); + + if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid && + (val & ICH_LR_ACTIVE_BIT)) { + *lr_val = val; + return i; + } + } + + *lr_val = ICC_IAR1_EL1_SPURIOUS; + return -1; +} + +static int __hyp_text __vgic_v3_get_highest_active_priority(void) +{ + u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); + u32 hap = 0; + int i; + + for (i = 0; i < nr_apr_regs; i++) { + u32 val; + + /* + * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers + * contain the active priority levels for this VCPU + * for the maximum number of supported priority + * levels, and we return the full priority level only + * if the BPR is programmed to its minimum, otherwise + * we return a combination of the priority level and + * subpriority, as determined by the setting of the + * BPR, but without the full subpriority. + */ + val = __vgic_v3_read_ap0rn(i); + val |= __vgic_v3_read_ap1rn(i); + if (!val) { + hap += 32; + continue; + } + + return (hap + __ffs(val)) << __vgic_v3_bpr_min(); + } + + return GICv3_IDLE_PRIORITY; +} + +static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr) +{ + return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; +} + +static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr) +{ + unsigned int bpr; + + if (vmcr & ICH_VMCR_CBPR_MASK) { + bpr = __vgic_v3_get_bpr0(vmcr); + if (bpr < 7) + bpr++; + } else { + bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + } + + return bpr; +} + +/* + * Convert a priority to a preemption level, taking the relevant BPR + * into account by zeroing the sub-priority bits. + */ +static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) +{ + unsigned int bpr; + + if (!grp) + bpr = __vgic_v3_get_bpr0(vmcr) + 1; + else + bpr = __vgic_v3_get_bpr1(vmcr); + + return pri & (GENMASK(7, 0) << bpr); +} + +/* + * The priority value is independent of any of the BPR values, so we + * normalize it using the minimal BPR value. This guarantees that no + * matter what the guest does with its BPR, we can always set/get the + * same value of a priority. 
+ */ +static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp) +{ + u8 pre, ap; + u32 val; + int apr; + + pre = __vgic_v3_pri_to_pre(pri, vmcr, grp); + ap = pre >> __vgic_v3_bpr_min(); + apr = ap / 32; + + if (!grp) { + val = __vgic_v3_read_ap0rn(apr); + __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr); + } else { + val = __vgic_v3_read_ap1rn(apr); + __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr); + } +} + +static int __hyp_text __vgic_v3_clear_highest_active_priority(void) +{ + u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); + u32 hap = 0; + int i; + + for (i = 0; i < nr_apr_regs; i++) { + u32 ap0, ap1; + int c0, c1; + + ap0 = __vgic_v3_read_ap0rn(i); + ap1 = __vgic_v3_read_ap1rn(i); + if (!ap0 && !ap1) { + hap += 32; + continue; + } + + c0 = ap0 ? __ffs(ap0) : 32; + c1 = ap1 ? __ffs(ap1) : 32; + + /* Always clear the LSB, which is the highest priority */ + if (c0 < c1) { + ap0 &= ~BIT(c0); + __vgic_v3_write_ap0rn(ap0, i); + hap += c0; + } else { + ap1 &= ~BIT(c1); + __vgic_v3_write_ap1rn(ap1, i); + hap += c1; + } + + /* Rescale to 8 bits of priority */ + return hap << __vgic_v3_bpr_min(); + } + + return GICv3_IDLE_PRIORITY; +} + +static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 lr_val; + u8 lr_prio, pmr; + int lr, grp; + + grp = __vgic_v3_get_group(vcpu); + + lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); + if (lr < 0) + goto spurious; + + if (grp != !!(lr_val & ICH_LR_GROUP)) + goto spurious; + + pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + if (pmr <= lr_prio) + goto spurious; + + if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp)) + goto spurious; + + lr_val &= ~ICH_LR_STATE; + /* No active state for LPIs */ + if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) + lr_val |= ICH_LR_ACTIVE_BIT; + __gic_v3_set_lr(lr_val, lr); + __vgic_v3_set_active_priority(lr_prio, vmcr, grp); + vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); + return; + +spurious: + vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS); +} + +static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val) +{ + lr_val &= ~ICH_LR_ACTIVE_BIT; + if (lr_val & ICH_LR_HW) { + u32 pid; + + pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT; + gic_write_dir(pid); + } + + __gic_v3_set_lr(lr_val, lr); +} + +static void __hyp_text __vgic_v3_bump_eoicount(void) +{ + u32 hcr; + + hcr = read_gicreg(ICH_HCR_EL2); + hcr += 1 << ICH_HCR_EOIcount_SHIFT; + write_gicreg(hcr, ICH_HCR_EL2); +} + +static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 vid = vcpu_get_reg(vcpu, rt); + u64 lr_val; + int lr; + + /* EOImode == 0, nothing to be done here */ + if (!(vmcr & ICH_VMCR_EOIM_MASK)) + return; + + /* No deactivate to be performed on an LPI */ + if (vid >= VGIC_MIN_LPI) + return; + + lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); + if (lr == -1) { + __vgic_v3_bump_eoicount(); + return; + } + + __vgic_v3_clear_active_lr(lr, lr_val); +} + +static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u32 vid = vcpu_get_reg(vcpu, rt); + u64 lr_val; + u8 lr_prio, act_prio; + int lr, grp; + + grp = __vgic_v3_get_group(vcpu); + + /* Drop priority in any case */ + act_prio = __vgic_v3_clear_highest_active_priority(); + + /* If EOIing an LPI, no deactivate to be performed */ + if (vid >= VGIC_MIN_LPI) + return; + + /* EOImode == 1, nothing to be done here 
*/ + if (vmcr & ICH_VMCR_EOIM_MASK) + return; + + lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); + if (lr == -1) { + __vgic_v3_bump_eoicount(); + return; + } + + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; + + /* If priorities or group do not match, the guest has fscked-up. */ + if (grp != !!(lr_val & ICH_LR_GROUP) || + __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio) + return; + + /* Let's now perform the deactivation */ + __vgic_v3_clear_active_lr(lr, lr_val); +} + +static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); +} + +static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); +} + +static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + + if (val & 1) + vmcr |= ICH_VMCR_ENG0_MASK; + else + vmcr &= ~ICH_VMCR_ENG0_MASK; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + + if (val & 1) + vmcr |= ICH_VMCR_ENG1_MASK; + else + vmcr &= ~ICH_VMCR_ENG1_MASK; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr)); +} + +static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr)); +} + +static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + u8 bpr_min = __vgic_v3_bpr_min() - 1; + + /* Enforce BPR limiting */ + if (val < bpr_min) + val = bpr_min; + + val <<= ICH_VMCR_BPR0_SHIFT; + val &= ICH_VMCR_BPR0_MASK; + vmcr &= ~ICH_VMCR_BPR0_MASK; + vmcr |= val; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + u64 val = vcpu_get_reg(vcpu, rt); + u8 bpr_min = __vgic_v3_bpr_min(); + + if (vmcr & ICH_VMCR_CBPR_MASK) + return; + + /* Enforce BPR limiting */ + if (val < bpr_min) + val = bpr_min; + + val <<= ICH_VMCR_BPR1_SHIFT; + val &= ICH_VMCR_BPR1_MASK; + vmcr &= ~ICH_VMCR_BPR1_MASK; + vmcr |= val; + + __vgic_v3_write_vmcr(vmcr); +} + +static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n) +{ + u32 val; + + if (!__vgic_v3_get_group(vcpu)) + val = __vgic_v3_read_ap0rn(n); + else + val = __vgic_v3_read_ap1rn(n); + + vcpu_set_reg(vcpu, rt, val); +} + +static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + if (!__vgic_v3_get_group(vcpu)) + __vgic_v3_write_ap0rn(val, n); + else + __vgic_v3_write_ap1rn(val, n); +} + +static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 0); +} + +static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 1); +} + +static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 2); +} + +static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_read_apxrn(vcpu, rt, 3); +} + +static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 0); +} + 
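The comments around __vgic_v3_pri_to_pre() and __vgic_v3_set_active_priority() above describe how a GICv3 priority becomes a preemption level: the sub-priority bits selected by the binary point register (BPR) are zeroed, and active priorities are then recorded at the minimal BPR so the stored value does not depend on whatever BPR the guest programs. A rough, standalone C sketch of that masking follows (8-bit priority assumed, helper name hypothetical, the group-0 "BPR + 1" adjustment left out):

#include <stdint.h>
#include <stdio.h>

/*
 * Zero the sub-priority bits below the binary point, mirroring the
 * masking in __vgic_v3_pri_to_pre(): bits [bpr-1:0] never preempt.
 */
static uint8_t pri_to_preemption_level(uint8_t pri, unsigned int bpr)
{
	return pri & (0xffu << bpr);
}

int main(void)
{
	/* With BPR == 3, priorities 0x28 and 0x2f share level 0x28. */
	printf("0x%02x\n", pri_to_preemption_level(0x28, 3));
	printf("0x%02x\n", pri_to_preemption_level(0x2f, 3));
	return 0;
}

Priorities that differ only below the binary point collapse to the same level, which is why __vgic_v3_write_eoir() above compares preemption levels, not raw priorities, when matching an EOI against the dropped active priority.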
+static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 1); +} + +static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 2); +} + +static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + __vgic_v3_write_apxrn(vcpu, rt, 3); +} + +static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u64 lr_val; + int lr, lr_grp, grp; + + grp = __vgic_v3_get_group(vcpu); + + lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); + if (lr == -1) + goto spurious; + + lr_grp = !!(lr_val & ICH_LR_GROUP); + if (lr_grp != grp) + lr_val = ICC_IAR1_EL1_SPURIOUS; + +spurious: + vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); +} + +static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + vmcr &= ICH_VMCR_PMR_MASK; + vmcr >>= ICH_VMCR_PMR_SHIFT; + vcpu_set_reg(vcpu, rt, vmcr); +} + +static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + val <<= ICH_VMCR_PMR_SHIFT; + val &= ICH_VMCR_PMR_MASK; + vmcr &= ~ICH_VMCR_PMR_MASK; + vmcr |= val; + + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 val = __vgic_v3_get_highest_active_priority(); + vcpu_set_reg(vcpu, rt, val); +} + +static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 vtr, val; + + vtr = read_gicreg(ICH_VTR_EL2); + /* PRIbits */ + val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; + /* IDbits */ + val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; + /* SEIS */ + val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; + /* A3V */ + val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; + /* EOImode */ + val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; + /* CBPR */ + val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + + vcpu_set_reg(vcpu, rt, val); +} + +static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, + u32 vmcr, int rt) +{ + u32 val = vcpu_get_reg(vcpu, rt); + + if (val & ICC_CTLR_EL1_CBPR_MASK) + vmcr |= ICH_VMCR_CBPR_MASK; + else + vmcr &= ~ICH_VMCR_CBPR_MASK; + + if (val & ICC_CTLR_EL1_EOImode_MASK) + vmcr |= ICH_VMCR_EOIM_MASK; + else + vmcr &= ~ICH_VMCR_EOIM_MASK; + + write_gicreg(vmcr, ICH_VMCR_EL2); +} + +int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) +{ + int rt; + u32 esr; + u32 vmcr; + void (*fn)(struct kvm_vcpu *, u32, int); + bool is_read; + u32 sysreg; + + esr = kvm_vcpu_get_hsr(vcpu); + if (vcpu_mode_is_32bit(vcpu)) { + if (!kvm_condition_valid(vcpu)) { + __kvm_skip_instr(vcpu); + return 1; + } + + sysreg = esr_cp15_to_sysreg(esr); + } else { + sysreg = esr_sys64_to_sysreg(esr); + } + + is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + + switch (sysreg) { + case SYS_ICC_IAR0_EL1: + case SYS_ICC_IAR1_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_iar; + break; + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_EOIR1_EL1: + if (unlikely(is_read)) + return 0; + fn = __vgic_v3_write_eoir; + break; + case SYS_ICC_IGRPEN1_EL1: + if (is_read) + fn = __vgic_v3_read_igrpen1; + else + fn = __vgic_v3_write_igrpen1; + break; + case SYS_ICC_BPR1_EL1: + if (is_read) + fn = __vgic_v3_read_bpr1; + else + fn = __vgic_v3_write_bpr1; + break; + case SYS_ICC_AP0Rn_EL1(0): + case 
SYS_ICC_AP1Rn_EL1(0): + if (is_read) + fn = __vgic_v3_read_apxr0; + else + fn = __vgic_v3_write_apxr0; + break; + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(1): + if (is_read) + fn = __vgic_v3_read_apxr1; + else + fn = __vgic_v3_write_apxr1; + break; + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(2): + if (is_read) + fn = __vgic_v3_read_apxr2; + else + fn = __vgic_v3_write_apxr2; + break; + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_AP1Rn_EL1(3): + if (is_read) + fn = __vgic_v3_read_apxr3; + else + fn = __vgic_v3_write_apxr3; + break; + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_HPPIR1_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_hppir; + break; + case SYS_ICC_IGRPEN0_EL1: + if (is_read) + fn = __vgic_v3_read_igrpen0; + else + fn = __vgic_v3_write_igrpen0; + break; + case SYS_ICC_BPR0_EL1: + if (is_read) + fn = __vgic_v3_read_bpr0; + else + fn = __vgic_v3_write_bpr0; + break; + case SYS_ICC_DIR_EL1: + if (unlikely(is_read)) + return 0; + fn = __vgic_v3_write_dir; + break; + case SYS_ICC_RPR_EL1: + if (unlikely(!is_read)) + return 0; + fn = __vgic_v3_read_rpr; + break; + case SYS_ICC_CTLR_EL1: + if (is_read) + fn = __vgic_v3_read_ctlr; + else + fn = __vgic_v3_write_ctlr; + break; + case SYS_ICC_PMR_EL1: + if (is_read) + fn = __vgic_v3_read_pmr; + else + fn = __vgic_v3_write_pmr; + break; + default: + return 0; + } + + vmcr = __vgic_v3_read_vmcr(); + rt = kvm_vcpu_sys_get_rt(vcpu); + fn(vcpu, vmcr, rt); + + __kvm_skip_instr(vcpu); + + return 1; +} diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c new file mode 100644 index 000000000000..550dfa3e53cd --- /dev/null +++ b/arch/arm64/kvm/hypercalls.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> + +#include <kvm/arm_hypercalls.h> +#include <kvm/arm_psci.h> + +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) +{ + u32 func_id = smccc_get_function(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + u32 feature; + gpa_t gpa; + + switch (func_id) { + case ARM_SMCCC_VERSION_FUNC_ID: + val = ARM_SMCCC_VERSION_1_1; + break; + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + feature = smccc_get_arg1(vcpu); + switch (feature) { + case ARM_SMCCC_ARCH_WORKAROUND_1: + switch (kvm_arm_harden_branch_predictor()) { + case KVM_BP_HARDEN_UNKNOWN: + break; + case KVM_BP_HARDEN_WA_NEEDED: + val = SMCCC_RET_SUCCESS; + break; + case KVM_BP_HARDEN_NOT_REQUIRED: + val = SMCCC_RET_NOT_REQUIRED; + break; + } + break; + case ARM_SMCCC_ARCH_WORKAROUND_2: + switch (kvm_arm_have_ssbd()) { + case KVM_SSBD_FORCE_DISABLE: + case KVM_SSBD_UNKNOWN: + break; + case KVM_SSBD_KERNEL: + val = SMCCC_RET_SUCCESS; + break; + case KVM_SSBD_FORCE_ENABLE: + case KVM_SSBD_MITIGATED: + val = SMCCC_RET_NOT_REQUIRED; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + val = SMCCC_RET_SUCCESS; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + val = kvm_hypercall_pv_features(vcpu); + break; + case ARM_SMCCC_HV_PV_TIME_ST: + gpa = kvm_init_stolen_time(vcpu); + if (gpa != GPA_INVALID) + val = gpa; + break; + default: + return kvm_psci_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 6aafc2825c1c..e21fdd93027a 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -26,28 +26,12 @@ enum exception_type { except_type_serror = 0x180, }; -static u64 get_except_vector(struct kvm_vcpu 
*vcpu, enum exception_type type) -{ - u64 exc_offset; - - switch (*vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT)) { - case PSR_MODE_EL1t: - exc_offset = CURRENT_EL_SP_EL0_VECTOR; - break; - case PSR_MODE_EL1h: - exc_offset = CURRENT_EL_SP_ELx_VECTOR; - break; - case PSR_MODE_EL0t: - exc_offset = LOWER_EL_AArch64_VECTOR; - break; - default: - exc_offset = LOWER_EL_AArch32_VECTOR; - } - - return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type; -} - /* + * This performs the exception entry at a given EL (@target_mode), stashing PC + * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. + * The EL passed to this function *must* be a non-secure, privileged mode with + * bit 0 being set (PSTATE.SP == 1). + * * When an exception is taken, most PSTATE fields are left unchanged in the * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx @@ -59,10 +43,35 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type) * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from * MSB to LSB. */ -static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu) +static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, + enum exception_type type) { - unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); - unsigned long old, new; + unsigned long sctlr, vbar, old, new, mode; + u64 exc_offset; + + mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + + if (mode == target_mode) + exc_offset = CURRENT_EL_SP_ELx_VECTOR; + else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) + exc_offset = CURRENT_EL_SP_EL0_VECTOR; + else if (!(mode & PSR_MODE32_BIT)) + exc_offset = LOWER_EL_AArch64_VECTOR; + else + exc_offset = LOWER_EL_AArch32_VECTOR; + + switch (target_mode) { + case PSR_MODE_EL1h: + vbar = vcpu_read_sys_reg(vcpu, VBAR_EL1); + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); + break; + default: + /* Don't do that */ + BUG(); + } + + *vcpu_pc(vcpu) = vbar + exc_offset + type; old = *vcpu_cpsr(vcpu); new = 0; @@ -105,9 +114,10 @@ static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu) new |= PSR_I_BIT; new |= PSR_F_BIT; - new |= PSR_MODE_EL1h; + new |= target_mode; - return new; + *vcpu_cpsr(vcpu) = new; + vcpu_write_spsr(vcpu, old); } static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) @@ -116,11 +126,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr bool is_aarch32 = vcpu_mode_is_32bit(vcpu); u32 esr = 0; - vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); - *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - - *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); - vcpu_write_spsr(vcpu, cpsr); + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -148,14 +154,9 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr static void inject_undef64(struct kvm_vcpu *vcpu) { - unsigned long cpsr = *vcpu_cpsr(vcpu); u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); - *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - - *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); - vcpu_write_spsr(vcpu, cpsr); + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); /* * Build an unknown exception, depending on the instruction diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c new 
file mode 100644 index 000000000000..4e0366759726 --- /dev/null +++ b/arch/arm64/kvm/mmio.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <trace/events/kvm.h> + +#include "trace.h" + +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) +{ + void *datap = NULL; + union { + u8 byte; + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + tmp.byte = data; + datap = &tmp.byte; + break; + case 2: + tmp.hword = data; + datap = &tmp.hword; + break; + case 4: + tmp.word = data; + datap = &tmp.word; + break; + case 8: + tmp.dword = data; + datap = &tmp.dword; + break; + } + + memcpy(buf, datap, len); +} + +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) +{ + unsigned long data = 0; + union { + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + data = *(u8 *)buf; + break; + case 2: + memcpy(&tmp.hword, buf, len); + data = tmp.hword; + break; + case 4: + memcpy(&tmp.word, buf, len); + data = tmp.word; + break; + case 8: + memcpy(&tmp.dword, buf, len); + data = tmp.dword; + break; + } + + return data; +} + +/** + * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation + * or in-kernel IO emulation + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the mmio data + */ +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long data; + unsigned int len; + int mask; + + /* Detect an already handled MMIO return */ + if (unlikely(!vcpu->mmio_needed)) + return 0; + + vcpu->mmio_needed = 0; + + if (!kvm_vcpu_dabt_iswrite(vcpu)) { + len = kvm_vcpu_dabt_get_as(vcpu); + data = kvm_mmio_read_buf(run->mmio.data, len); + + if (kvm_vcpu_dabt_issext(vcpu) && + len < sizeof(unsigned long)) { + mask = 1U << ((len * 8) - 1); + data = (data ^ mask) - mask; + } + + if (!kvm_vcpu_dabt_issf(vcpu)) + data = data & 0xffffffff; + + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, + &data); + data = vcpu_data_host_to_guest(vcpu, data, len); + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); + } + + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + + return 0; +} + +int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, + phys_addr_t fault_ipa) +{ + unsigned long data; + unsigned long rt; + int ret; + bool is_write; + int len; + u8 data_buf[8]; + + /* + * No valid syndrome? Ask userspace for help if it has + * volunteered to do so, and bail out otherwise. + */ + if (!kvm_vcpu_dabt_isvalid(vcpu)) { + if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { + run->exit_reason = KVM_EXIT_ARM_NISV; + run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); + run->arm_nisv.fault_ipa = fault_ipa; + return 0; + } + + kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n"); + return -ENOSYS; + } + + /* Page table accesses IO mem: tell guest to fix its TTBR */ + if (kvm_vcpu_dabt_iss1tw(vcpu)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + + /* + * Prepare MMIO operation. First decode the syndrome data we get + * from the CPU. Then try if some in-kernel emulation feels + * responsible, otherwise let user space do its magic. 
+ */ + is_write = kvm_vcpu_dabt_iswrite(vcpu); + len = kvm_vcpu_dabt_get_as(vcpu); + rt = kvm_vcpu_dabt_get_rd(vcpu); + + if (is_write) { + data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), + len); + + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); + kvm_mmio_write_buf(data_buf, len, data); + + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); + } else { + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, + fault_ipa, NULL); + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); + } + + /* Now prepare kvm_run for the potential return to userland. */ + run->mmio.is_write = is_write; + run->mmio.phys_addr = fault_ipa; + run->mmio.len = len; + vcpu->mmio_needed = 1; + + if (!ret) { + /* We handled the access successfully in the kernel. */ + if (!is_write) + memcpy(run->mmio.data, data_buf, len); + vcpu->stat.mmio_exit_kernel++; + kvm_handle_mmio_return(vcpu, run); + return 1; + } + + if (is_write) + memcpy(run->mmio.data, data_buf, len); + vcpu->stat.mmio_exit_user++; + run->exit_reason = KVM_EXIT_MMIO; + return 0; +} diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c new file mode 100644 index 000000000000..a1f6bc70c4e4 --- /dev/null +++ b/arch/arm64/kvm/mmu.c @@ -0,0 +1,2467 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + */ + +#include <linux/mman.h> +#include <linux/kvm_host.h> +#include <linux/io.h> +#include <linux/hugetlb.h> +#include <linux/sched/signal.h> +#include <trace/events/kvm.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_ras.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/virt.h> + +#include "trace.h" + +static pgd_t *boot_hyp_pgd; +static pgd_t *hyp_pgd; +static pgd_t *merged_hyp_pgd; +static DEFINE_MUTEX(kvm_hyp_pgd_mutex); + +static unsigned long hyp_idmap_start; +static unsigned long hyp_idmap_end; +static phys_addr_t hyp_idmap_vector; + +static unsigned long io_map_base; + +#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) + +#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) +#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) + +static bool is_iomap(unsigned long flags) +{ + return flags & KVM_S2PTE_FLAG_IS_IOMAP; +} + +static bool memslot_is_logging(struct kvm_memory_slot *memslot) +{ + return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); +} + +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + +static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); +} + +/* + * D-Cache management functions. They take the page table entries by + * value, as they are flushing the cache using the kernel mapping (or + * kmap on 32bit). + */ +static void kvm_flush_dcache_pte(pte_t pte) +{ + __kvm_flush_dcache_pte(pte); +} + +static void kvm_flush_dcache_pmd(pmd_t pmd) +{ + __kvm_flush_dcache_pmd(pmd); +} + +static void kvm_flush_dcache_pud(pud_t pud) +{ + __kvm_flush_dcache_pud(pud); +} + +static bool kvm_is_device_pfn(unsigned long pfn) +{ + return !pfn_valid(pfn); +} + +/** + * stage2_dissolve_pmd() - clear and flush huge PMD entry + * @kvm: pointer to kvm structure. 
+ * @addr: IPA + * @pmd: pmd pointer for IPA + * + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. + */ +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) +{ + if (!pmd_thp_or_huge(*pmd)) + return; + + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pmd)); +} + +/** + * stage2_dissolve_pud() - clear and flush huge PUD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pud: pud pointer for IPA + * + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. + */ +static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) +{ + if (!stage2_pud_huge(kvm, *pudp)) + return; + + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pudp)); +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + int min, int max) +{ + void *page; + + BUG_ON(max > KVM_NR_MEM_OBJS); + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < max) { + page = (void *)__get_free_page(GFP_PGTABLE_USER); + if (!page) + return -ENOMEM; + cache->objects[cache->nobjs++] = page; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) +{ + void *p; + + BUG_ON(!mc || !mc->nobjs); + p = mc->objects[--mc->nobjs]; + return p; +} + +static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) +{ + pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); + stage2_pgd_clear(kvm, pgd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + stage2_pud_free(kvm, pud_table); + put_page(virt_to_page(pgd)); +} + +static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) +{ + pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); + VM_BUG_ON(stage2_pud_huge(kvm, *pud)); + stage2_pud_clear(kvm, pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + stage2_pmd_free(kvm, pmd_table); + put_page(virt_to_page(pud)); +} + +static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) +{ + pte_t *pte_table = pte_offset_kernel(pmd, 0); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + free_page((unsigned long)pte_table); + put_page(virt_to_page(pmd)); +} + +static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) +{ + WRITE_ONCE(*ptep, new_pte); + dsb(ishst); +} + +static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd) +{ + WRITE_ONCE(*pmdp, new_pmd); + dsb(ishst); +} + +static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) +{ + kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); +} + +static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) +{ + WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); + dsb(ishst); +} + +static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) +{ + WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); + dsb(ishst); +} + +/* + * Unmapping vs dcache management: + * + * If a guest maps certain memory pages as uncached, all writes will + * bypass the data cache and go directly to RAM. However, the CPUs + * can still speculate reads (not writes) and fill cache lines with + * data. + * + * Those cache lines will be *clean* cache lines though, so a + * clean+invalidate operation is equivalent to an invalidate + * operation, because no cache lines are marked dirty. 
+ * + * Those clean cache lines could be filled prior to an uncached write + * by the guest, and the cache coherent IO subsystem would therefore + * end up writing old data to disk. + * + * This is why right after unmapping a page/section and invalidating + * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure + * the IO subsystem will never hit in the cache. + * + * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as + * we then fully enforce cacheability of RAM, no matter what the guest + * does. + */ +static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t start_addr = addr; + pte_t *pte, *start_pte; + + start_pte = pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + pte_t old_pte = *pte; + + kvm_set_pte(pte, __pte(0)); + kvm_tlb_flush_vmid_ipa(kvm, addr); + + /* No need to invalidate the cache for device mappings */ + if (!kvm_is_device_pfn(pte_pfn(old_pte))) + kvm_flush_dcache_pte(old_pte); + + put_page(virt_to_page(pte)); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (stage2_pte_table_empty(kvm, start_pte)) + clear_stage2_pmd_entry(kvm, pmd, start_addr); +} + +static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next, start_addr = addr; + pmd_t *pmd, *start_pmd; + + start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); + do { + next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { + pmd_t old_pmd = *pmd; + + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + + kvm_flush_dcache_pmd(old_pmd); + + put_page(virt_to_page(pmd)); + } else { + unmap_stage2_ptes(kvm, pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); + + if (stage2_pmd_table_empty(kvm, start_pmd)) + clear_stage2_pud_entry(kvm, pud, start_addr); +} + +static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next, start_addr = addr; + pud_t *pud, *start_pud; + + start_pud = pud = stage2_pud_offset(kvm, pgd, addr); + do { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) { + pud_t old_pud = *pud; + + stage2_pud_clear(kvm, pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + kvm_flush_dcache_pud(old_pud); + put_page(virt_to_page(pud)); + } else { + unmap_stage2_pmds(kvm, pud, addr, next); + } + } + } while (pud++, addr = next, addr != end); + + if (stage2_pud_table_empty(kvm, start_pud)) + clear_stage2_pgd_entry(kvm, pgd, start_addr); +} + +/** + * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * @kvm: The VM pointer + * @start: The intermediate physical base address of the range to unmap + * @size: The size of the area to unmap + * + * Clear a range of stage-2 mappings, lowering the various ref-counts. Must + * be called while holding mmu_lock (unless for freeing the stage2 pgd before + * destroying the VM), otherwise another faulting VCPU may come in and mess + * with things behind our backs. + */ +static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + assert_spin_locked(&kvm->mmu_lock); + WARN_ON(size & ~PAGE_MASK); + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + /* + * Make sure the page table is still active, as another thread + * could have possibly freed the page table, while we released + * the lock. 
+ */ + if (!READ_ONCE(kvm->arch.pgd)) + break; + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) + unmap_stage2_puds(kvm, pgd, addr, next); + /* + * If the range is too large, release the kvm->mmu_lock + * to prevent starvation and lockup detector warnings. + */ + if (next != end) + cond_resched_lock(&kvm->mmu_lock); + } while (pgd++, addr = next, addr != end); +} + +static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, + phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) + kvm_flush_dcache_pte(*pte); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = stage2_pmd_offset(kvm, pud, addr); + do { + next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) + kvm_flush_dcache_pmd(*pmd); + else + stage2_flush_ptes(kvm, pmd, addr, next); + } + } while (pmd++, addr = next, addr != end); +} + +static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = stage2_pud_offset(kvm, pgd, addr); + do { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) + kvm_flush_dcache_pud(*pud); + else + stage2_flush_pmds(kvm, pud, addr, next); + } + } while (pud++, addr = next, addr != end); +} + +static void stage2_flush_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = addr + PAGE_SIZE * memslot->npages; + phys_addr_t next; + pgd_t *pgd; + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) + stage2_flush_puds(kvm, pgd, addr, next); + + if (next != end) + cond_resched_lock(&kvm->mmu_lock); + } while (pgd++, addr = next, addr != end); +} + +/** + * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 + * @kvm: The struct kvm pointer + * + * Go through the stage 2 page tables and invalidate any cache lines + * backing memory already mapped to the VM. 
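All of the unmap and flush walkers above share one idiom: clamp the current range to the next table boundary with a *_addr_end() helper, handle a single entry, then continue from the clamp point. A standalone sketch of that boundary arithmetic, assuming a 2MiB span per entry and ignoring the end-of-address-space wrap handling the real helpers also have:

#include <stdint.h>
#include <stdio.h>

#define EX_BLOCK_SIZE   (2UL * 1024 * 1024)     /* assumed per-entry span */
#define EX_BLOCK_MASK   (~(EX_BLOCK_SIZE - 1))

/* End of the current entry's span, clamped to the end of the whole range. */
static uint64_t ex_addr_end(uint64_t addr, uint64_t end)
{
        uint64_t boundary = (addr + EX_BLOCK_SIZE) & EX_BLOCK_MASK;

        return boundary < end ? boundary : end;
}

int main(void)
{
        uint64_t addr = 0x100000;       /* starts in the middle of a block */
        uint64_t end = 0x500000;
        uint64_t next;

        do {
                next = ex_addr_end(addr, end);
                printf("handle one entry for [%#llx, %#llx)\n",
                       (unsigned long long)addr, (unsigned long long)next);
        } while (addr = next, addr != end);
        return 0;
}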
+ */ +static void stage2_flush_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + stage2_flush_memslot(kvm, memslot); + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + +static void clear_hyp_pgd_entry(pgd_t *pgd) +{ + pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + pgd_clear(pgd); + pud_free(NULL, pud_table); + put_page(virt_to_page(pgd)); +} + +static void clear_hyp_pud_entry(pud_t *pud) +{ + pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); + VM_BUG_ON(pud_huge(*pud)); + pud_clear(pud); + pmd_free(NULL, pmd_table); + put_page(virt_to_page(pud)); +} + +static void clear_hyp_pmd_entry(pmd_t *pmd) +{ + pte_t *pte_table = pte_offset_kernel(pmd, 0); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); + pmd_clear(pmd); + pte_free_kernel(NULL, pte_table); + put_page(virt_to_page(pmd)); +} + +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte, *start_pte; + + start_pte = pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + kvm_set_pte(pte, __pte(0)); + put_page(virt_to_page(pte)); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (hyp_pte_table_empty(start_pte)) + clear_hyp_pmd_entry(pmd); +} + +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pmd_t *pmd, *start_pmd; + + start_pmd = pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + /* Hyp doesn't use huge pmds */ + if (!pmd_none(*pmd)) + unmap_hyp_ptes(pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + if (hyp_pmd_table_empty(start_pmd)) + clear_hyp_pud_entry(pud); +} + +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pud_t *pud, *start_pud; + + start_pud = pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + /* Hyp doesn't use huge puds */ + if (!pud_none(*pud)) + unmap_hyp_pmds(pud, addr, next); + } while (pud++, addr = next, addr != end); + + if (hyp_pud_table_empty(start_pud)) + clear_hyp_pgd_entry(pgd); +} + +static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) +{ + return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); +} + +static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, + phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + /* + * We don't unmap anything from HYP, except at the hyp tear down. + * Hence, we don't have to invalidate the TLBs here. + */ + pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); + do { + next = pgd_addr_end(addr, end); + if (!pgd_none(*pgd)) + unmap_hyp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) +{ + __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); +} + +static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) +{ + __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); +} + +/** + * free_hyp_pgds - free Hyp-mode page tables + * + * Assumes hyp_pgd is a page table used strictly in Hyp-mode and + * therefore contains either mappings in the kernel memory area (above + * PAGE_OFFSET), or device mappings in the idmap range. + * + * boot_hyp_pgd should only map the idmap range, and is only used in + * the extended idmap case. 
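kvm_pgd_index() above is a plain shift-and-mask; masking with ptrs_per_pgd - 1 is what lets the same helper index both the regular hyp table and a larger extended-idmap table. A worked sketch with assumed geometry (PGDIR_SHIFT of 39, i.e. 4K pages with four levels):

#include <assert.h>
#include <stdint.h>

#define EX_PGDIR_SHIFT  39U     /* assumed: 4K pages, 48-bit VA, 4 levels */

static unsigned int ex_pgd_index(uint64_t addr, unsigned int ptrs_per_pgd)
{
        return (addr >> EX_PGDIR_SHIFT) & (ptrs_per_pgd - 1);
}

int main(void)
{
        /* Addresses 512GiB apart land in adjacent slots. */
        assert(ex_pgd_index(0, 512) == 0);
        assert(ex_pgd_index(1UL << EX_PGDIR_SHIFT, 512) == 1);
        /* An extended table with more entries just widens the mask. */
        assert(ex_pgd_index(513UL << EX_PGDIR_SHIFT, 1024) == 513);
        assert(ex_pgd_index(513UL << EX_PGDIR_SHIFT, 512) == 1);
        return 0;
}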
+ */ +void free_hyp_pgds(void) +{ + pgd_t *id_pgd; + + mutex_lock(&kvm_hyp_pgd_mutex); + + id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd; + + if (id_pgd) { + /* In case we never called hyp_mmu_init() */ + if (!io_map_base) + io_map_base = hyp_idmap_start; + unmap_hyp_idmap_range(id_pgd, io_map_base, + hyp_idmap_start + PAGE_SIZE - io_map_base); + } + + if (boot_hyp_pgd) { + free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); + boot_hyp_pgd = NULL; + } + + if (hyp_pgd) { + unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), + (uintptr_t)high_memory - PAGE_OFFSET); + + free_pages((unsigned long)hyp_pgd, hyp_pgd_order); + hyp_pgd = NULL; + } + if (merged_hyp_pgd) { + clear_page(merged_hyp_pgd); + free_page((unsigned long)merged_hyp_pgd); + merged_hyp_pgd = NULL; + } + + mutex_unlock(&kvm_hyp_pgd_mutex); +} + +static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + pte_t *pte; + unsigned long addr; + + addr = start; + do { + pte = pte_offset_kernel(pmd, addr); + kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); + get_page(virt_to_page(pte)); + pfn++; + } while (addr += PAGE_SIZE, addr != end); +} + +static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + pmd_t *pmd; + pte_t *pte; + unsigned long addr, next; + + addr = start; + do { + pmd = pmd_offset(pud, addr); + + BUG_ON(pmd_sect(*pmd)); + + if (pmd_none(*pmd)) { + pte = pte_alloc_one_kernel(NULL); + if (!pte) { + kvm_err("Cannot allocate Hyp pte\n"); + return -ENOMEM; + } + kvm_pmd_populate(pmd, pte); + get_page(virt_to_page(pmd)); + } + + next = pmd_addr_end(addr, end); + + create_hyp_pte_mappings(pmd, addr, next, pfn, prot); + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + +static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + pud_t *pud; + pmd_t *pmd; + unsigned long addr, next; + int ret; + + addr = start; + do { + pud = pud_offset(pgd, addr); + + if (pud_none_or_clear_bad(pud)) { + pmd = pmd_alloc_one(NULL, addr); + if (!pmd) { + kvm_err("Cannot allocate Hyp pmd\n"); + return -ENOMEM; + } + kvm_pud_populate(pud, pmd); + get_page(virt_to_page(pud)); + } + + next = pud_addr_end(addr, end); + ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); + if (ret) + return ret; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + +static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, + unsigned long start, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + unsigned long addr, next; + int err = 0; + + mutex_lock(&kvm_hyp_pgd_mutex); + addr = start & PAGE_MASK; + end = PAGE_ALIGN(end); + do { + pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); + + if (pgd_none(*pgd)) { + pud = pud_alloc_one(NULL, addr); + if (!pud) { + kvm_err("Cannot allocate Hyp pud\n"); + err = -ENOMEM; + goto out; + } + kvm_pgd_populate(pgd, pud); + get_page(virt_to_page(pgd)); + } + + next = pgd_addr_end(addr, end); + err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); + if (err) + goto out; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); +out: + mutex_unlock(&kvm_hyp_pgd_mutex); + return err; +} + +static phys_addr_t kvm_kaddr_to_phys(void *kaddr) +{ + if (!is_vmalloc_addr(kaddr)) { + BUG_ON(!virt_addr_valid(kaddr)); + return __pa(kaddr); + } else { + return 
page_to_phys(vmalloc_to_page(kaddr)) + + offset_in_page(kaddr); + } +} + +/** + * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode + * @from: The virtual kernel start address of the range + * @to: The virtual kernel end address of the range (exclusive) + * @prot: The protection to be applied to this range + * + * The same virtual address as the kernel virtual address is also used + * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying + * physical pages. + */ +int create_hyp_mappings(void *from, void *to, pgprot_t prot) +{ + phys_addr_t phys_addr; + unsigned long virt_addr; + unsigned long start = kern_hyp_va((unsigned long)from); + unsigned long end = kern_hyp_va((unsigned long)to); + + if (is_kernel_in_hyp_mode()) + return 0; + + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); + + for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { + int err; + + phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); + err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, + virt_addr, virt_addr + PAGE_SIZE, + __phys_to_pfn(phys_addr), + prot); + if (err) + return err; + } + + return 0; +} + +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, + unsigned long *haddr, pgprot_t prot) +{ + pgd_t *pgd = hyp_pgd; + unsigned long base; + int ret = 0; + + mutex_lock(&kvm_hyp_pgd_mutex); + + /* + * This assumes that we have enough space below the idmap + * page to allocate our VAs. If not, the check below will + * kick. A potential alternative would be to detect that + * overflow and switch to an allocation above the idmap. + * + * The allocated size is always a multiple of PAGE_SIZE. + */ + size = PAGE_ALIGN(size + offset_in_page(phys_addr)); + base = io_map_base - size; + + /* + * Verify that BIT(VA_BITS - 1) hasn't been flipped by + * allocating the new area, as it would indicate we've + * overflowed the idmap/IO address range. 
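The check that follows this comment, (base ^ io_map_base) & BIT(VA_BITS - 1), catches the downward allocator falling out of its half of the VA space: if subtracting the size flips the top VA bit, the XOR of the old and new bases has that bit set. A standalone sketch of the same arithmetic, assuming VA_BITS is 48:

#include <assert.h>
#include <stdint.h>

#define EX_VA_BITS      48
#define EX_TOP_BIT      (1UL << (EX_VA_BITS - 1))

/* Returns 0 and updates *io_map_base on success, -1 on overflow. */
static int ex_alloc_down(uint64_t *io_map_base, uint64_t size, uint64_t *va)
{
        uint64_t base = *io_map_base - size;

        if ((base ^ *io_map_base) & EX_TOP_BIT)
                return -1;      /* top VA bit flipped: ran out of space */
        *io_map_base = base;
        *va = base;
        return 0;
}

int main(void)
{
        uint64_t io_map_base = EX_TOP_BIT + 0x10000;    /* just above the split */
        uint64_t va;

        assert(ex_alloc_down(&io_map_base, 0x1000, &va) == 0);
        assert(va == EX_TOP_BIT + 0xf000);
        assert(ex_alloc_down(&io_map_base, 0x20000, &va) == -1);
        return 0;
}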
+ */ + if ((base ^ io_map_base) & BIT(VA_BITS - 1)) + ret = -ENOMEM; + else + io_map_base = base; + + mutex_unlock(&kvm_hyp_pgd_mutex); + + if (ret) + goto out; + + if (__kvm_cpu_uses_extended_idmap()) + pgd = boot_hyp_pgd; + + ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), + base, base + size, + __phys_to_pfn(phys_addr), prot); + if (ret) + goto out; + + *haddr = base + offset_in_page(phys_addr); + +out: + return ret; +} + +/** + * create_hyp_io_mappings - Map IO into both kernel and HYP + * @phys_addr: The physical start address which gets mapped + * @size: Size of the region being mapped + * @kaddr: Kernel VA for this mapping + * @haddr: HYP VA for this mapping + */ +int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, + void __iomem **kaddr, + void __iomem **haddr) +{ + unsigned long addr; + int ret; + + *kaddr = ioremap(phys_addr, size); + if (!*kaddr) + return -ENOMEM; + + if (is_kernel_in_hyp_mode()) { + *haddr = *kaddr; + return 0; + } + + ret = __create_hyp_private_mapping(phys_addr, size, + &addr, PAGE_HYP_DEVICE); + if (ret) { + iounmap(*kaddr); + *kaddr = NULL; + *haddr = NULL; + return ret; + } + + *haddr = (void __iomem *)addr; + return 0; +} + +/** + * create_hyp_exec_mappings - Map an executable range into HYP + * @phys_addr: The physical start address which gets mapped + * @size: Size of the region being mapped + * @haddr: HYP VA for this mapping + */ +int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, + void **haddr) +{ + unsigned long addr; + int ret; + + BUG_ON(is_kernel_in_hyp_mode()); + + ret = __create_hyp_private_mapping(phys_addr, size, + &addr, PAGE_HYP_EXEC); + if (ret) { + *haddr = NULL; + return ret; + } + + *haddr = (void *)addr; + return 0; +} + +/** + * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. + * @kvm: The KVM struct pointer for the VM. + * + * Allocates only the stage-2 HW PGD level table(s) of size defined by + * stage2_pgd_size(kvm). + * + * Note we don't need locking here as this is only called when the VM is + * created, which can only be done once. + */ +int kvm_alloc_stage2_pgd(struct kvm *kvm) +{ + phys_addr_t pgd_phys; + pgd_t *pgd; + + if (kvm->arch.pgd != NULL) { + kvm_err("kvm_arch already initialized?\n"); + return -EINVAL; + } + + /* Allocate the HW PGD, making sure that each page gets its own refcount */ + pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); + if (!pgd) + return -ENOMEM; + + pgd_phys = virt_to_phys(pgd); + if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) + return -EINVAL; + + kvm->arch.pgd = pgd; + kvm->arch.pgd_phys = pgd_phys; + return 0; +} + +static void stage2_unmap_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + hva_t hva = memslot->userspace_addr; + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t size = PAGE_SIZE * memslot->npages; + hva_t reg_end = hva + size; + + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them to find out if we should + * unmap any of them. 
+ * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma = find_vma(current->mm, hva); + hva_t vm_start, vm_end; + + if (!vma || vma->vm_start >= reg_end) + break; + + /* + * Take the intersection of this VMA with the memory region + */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (!(vma->vm_flags & VM_PFNMAP)) { + gpa_t gpa = addr + (vm_start - memslot->userspace_addr); + unmap_stage2_range(kvm, gpa, vm_end - vm_start); + } + hva = vm_end; + } while (hva < reg_end); +} + +/** + * stage2_unmap_vm - Unmap Stage-2 RAM mappings + * @kvm: The struct kvm pointer + * + * Go through the memregions and unmap any regular RAM + * backing memory already mapped to the VM. + */ +void stage2_unmap_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + down_read(¤t->mm->mmap_sem); + spin_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + stage2_unmap_memslot(kvm, memslot); + + spin_unlock(&kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); + srcu_read_unlock(&kvm->srcu, idx); +} + +/** + * kvm_free_stage2_pgd - free all stage-2 tables + * @kvm: The KVM struct pointer for the VM. + * + * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all + * underlying level-2 and level-3 tables before freeing the actual level-1 table + * and setting the struct pointer to NULL. + */ +void kvm_free_stage2_pgd(struct kvm *kvm) +{ + void *pgd = NULL; + + spin_lock(&kvm->mmu_lock); + if (kvm->arch.pgd) { + unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); + pgd = READ_ONCE(kvm->arch.pgd); + kvm->arch.pgd = NULL; + kvm->arch.pgd_phys = 0; + } + spin_unlock(&kvm->mmu_lock); + + /* Free the HW pgd, one page at a time */ + if (pgd) + free_pages_exact(pgd, stage2_pgd_size(kvm)); +} + +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + if (stage2_pgd_none(kvm, *pgd)) { + if (!cache) + return NULL; + pud = mmu_memory_cache_alloc(cache); + stage2_pgd_populate(kvm, pgd, pud); + get_page(virt_to_page(pgd)); + } + + return stage2_pud_offset(kvm, pgd, addr); +} + +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = stage2_get_pud(kvm, cache, addr); + if (!pud || stage2_pud_huge(kvm, *pud)) + return NULL; + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return NULL; + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + return stage2_pmd_offset(kvm, pud, addr); +} + +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache + *cache, phys_addr_t addr, const pmd_t *new_pmd) +{ + pmd_t *pmd, old_pmd; + +retry: + pmd = stage2_get_pmd(kvm, cache, addr); + VM_BUG_ON(!pmd); + + old_pmd = *pmd; + /* + * Multiple vcpus faulting on the same PMD entry, can + * lead to them sequentially updating the PMD with the + * same value. Following the break-before-make + * (pmd_clear() followed by tlb_flush()) process can + * hinder forward progress due to refaults generated + * on missing translations. 
+ * + * Skip updating the page table if the entry is + * unchanged. + */ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + + if (pmd_present(old_pmd)) { + /* + * If we already have PTE level mapping for this block, + * we must unmap it to avoid inconsistent TLB state and + * leaking the table page. We could end up in this situation + * if the memory slot was marked for dirty logging and was + * reverted, leaving PTE level mappings for the pages accessed + * during the period. So, unmap the PTE level mapping for this + * block and retry, as we could have released the upper level + * table in the process. + * + * Normal THP split/merge follows mmu_notifier callbacks and do + * get handled accordingly. + */ + if (!pmd_thp_or_huge(old_pmd)) { + unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); + goto retry; + } + /* + * Mapping in huge pages should only happen through a + * fault. If a page is merged into a transparent huge + * page, the individual subpages of that huge page + * should be unmapped through MMU notifiers before we + * get here. + * + * Merging of CompoundPages is not supported; they + * should become splitting first, unmapped, merged, + * and mapped back in on-demand. + */ + WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pmd)); + } + + kvm_set_pmd(pmd, *new_pmd); + return 0; +} + +static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pud_t *new_pudp) +{ + pud_t *pudp, old_pud; + +retry: + pudp = stage2_get_pud(kvm, cache, addr); + VM_BUG_ON(!pudp); + + old_pud = *pudp; + + /* + * A large number of vcpus faulting on the same stage 2 entry, + * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). + * Skip updating the page tables if there is no change. + */ + if (pud_val(old_pud) == pud_val(*new_pudp)) + return 0; + + if (stage2_pud_present(kvm, old_pud)) { + /* + * If we already have table level mapping for this block, unmap + * the range for this block and retry. + */ + if (!stage2_pud_huge(kvm, old_pud)) { + unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); + goto retry; + } + + WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pudp)); + } + + kvm_set_pud(pudp, *new_pudp); + return 0; +} + +/* + * stage2_get_leaf_entry - walk the stage2 VM page tables and return + * true if a valid and present leaf-entry is found. A pointer to the + * leaf-entry is returned in the appropriate level variable - pudpp, + * pmdpp, ptepp. 
+ */ +static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, + pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + *pudpp = NULL; + *pmdpp = NULL; + *ptepp = NULL; + + pudp = stage2_get_pud(kvm, NULL, addr); + if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) + return false; + + if (stage2_pud_huge(kvm, *pudp)) { + *pudpp = pudp; + return true; + } + + pmdp = stage2_pmd_offset(kvm, pudp, addr); + if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) + return false; + + if (pmd_thp_or_huge(*pmdp)) { + *pmdpp = pmdp; + return true; + } + + ptep = pte_offset_kernel(pmdp, addr); + if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) + return false; + + *ptepp = ptep; + return true; +} + +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + bool found; + + found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); + if (!found) + return false; + + if (pudp) + return kvm_s2pud_exec(pudp); + else if (pmdp) + return kvm_s2pmd_exec(pmdp); + else + return kvm_s2pte_exec(ptep); +} + +static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pte_t *new_pte, + unsigned long flags) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte, old_pte; + bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; + bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; + + VM_BUG_ON(logging_active && !cache); + + /* Create stage-2 page table mapping - Levels 0 and 1 */ + pud = stage2_get_pud(kvm, cache, addr); + if (!pud) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PUD, then continue + * on to allocate page. + */ + if (logging_active) + stage2_dissolve_pud(kvm, addr, pud); + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + pmd = stage2_pmd_offset(kvm, pud, addr); + if (!pmd) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PMD, then continue on to + * allocate page. 
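The rest of stage2_set_pte() below follows the same update discipline as the huge-entry setters above: skip the write when the new value equals the current one, and otherwise break-before-make, that is, zap the entry and invalidate the TLB for that IPA before installing the replacement. A compact model of that rule, with a counter standing in for the TLB invalidation call:

#include <assert.h>
#include <stdint.h>

#define EX_VALID        (1UL << 0)

static unsigned int ex_tlb_invalidations;       /* stands in for the TLBI call */

static void ex_set_entry(uint64_t *entryp, uint64_t newval)
{
        uint64_t old = *entryp;

        if (old == newval)      /* unchanged: avoid pointless refaults */
                return;

        if (old & EX_VALID) {   /* break: zap and invalidate first ... */
                *entryp = 0;
                ex_tlb_invalidations++;
        }
        *entryp = newval;       /* ... then make the new mapping */
}

int main(void)
{
        uint64_t entry = 0;

        ex_set_entry(&entry, 0x1000 | EX_VALID);        /* first map: no TLBI */
        ex_set_entry(&entry, 0x1000 | EX_VALID);        /* same value: no TLBI */
        assert(ex_tlb_invalidations == 0);
        ex_set_entry(&entry, 0x2000 | EX_VALID);        /* remap: break first */
        assert(ex_tlb_invalidations == 1);
        assert(entry == (0x2000 | EX_VALID));
        return 0;
}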
+ */ + if (logging_active) + stage2_dissolve_pmd(kvm, addr, pmd); + + /* Create stage-2 page mappings - Level 2 */ + if (pmd_none(*pmd)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pte = mmu_memory_cache_alloc(cache); + kvm_pmd_populate(pmd, pte); + get_page(virt_to_page(pmd)); + } + + pte = pte_offset_kernel(pmd, addr); + + if (iomap && pte_present(*pte)) + return -EFAULT; + + /* Create 2nd stage page table mapping - Level 3 */ + old_pte = *pte; + if (pte_present(old_pte)) { + /* Skip page table update if there is no change */ + if (pte_val(old_pte) == pte_val(*new_pte)) + return 0; + + kvm_set_pte(pte, __pte(0)); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pte)); + } + + kvm_set_pte(pte, *new_pte); + return 0; +} + +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + if (pte_young(*pte)) { + *pte = pte_mkold(*pte); + return 1; + } + return 0; +} +#else +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + return __ptep_test_and_clear_young(pte); +} +#endif + +static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pmd); +} + +static int stage2_pudp_test_and_clear_young(pud_t *pud) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pud); +} + +/** + * kvm_phys_addr_ioremap - map a device range to guest IPA + * + * @kvm: The KVM pointer + * @guest_ipa: The IPA at which to insert the mapping + * @pa: The physical address of the device + * @size: The size of the mapping + */ +int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, + phys_addr_t pa, unsigned long size, bool writable) +{ + phys_addr_t addr, end; + int ret = 0; + unsigned long pfn; + struct kvm_mmu_memory_cache cache = { 0, }; + + end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; + pfn = __phys_to_pfn(pa); + + for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { + pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); + + if (writable) + pte = kvm_s2pte_mkwrite(pte); + + ret = mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); + if (ret) + goto out; + spin_lock(&kvm->mmu_lock); + ret = stage2_set_pte(kvm, &cache, addr, &pte, + KVM_S2PTE_FLAG_IS_IOMAP); + spin_unlock(&kvm->mmu_lock); + if (ret) + goto out; + + pfn++; + } + +out: + mmu_free_memory_cache(&cache); + return ret; +} + +/** + * stage2_wp_ptes - write protect PMD range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmds - write protect PUD range + * kvm: kvm instance for the VM + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = stage2_pmd_offset(kvm, pud, addr); + + do { + next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else { + stage2_wp_ptes(pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); +} + +/** + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: 
range start address + * @end: range end address + */ +static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = stage2_pud_offset(kvm, pgd, addr); + do { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) { + if (!kvm_s2pud_readonly(pud)) + kvm_set_s2pud_readonly(pud); + } else { + stage2_wp_pmds(kvm, pud, addr, next); + } + } + } while (pud++, addr = next, addr != end); +} + +/** + * stage2_wp_range() - write protect stage2 memory region range + * @kvm: The KVM pointer + * @addr: Start address of range + * @end: End address of range + */ +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + pgd_t *pgd; + phys_addr_t next; + + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + /* + * Release kvm_mmu_lock periodically if the memory region is + * large. Otherwise, we may see kernel panics with + * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, + * CONFIG_LOCKDEP. Additionally, holding the lock too long + * will also starve other vCPUs. We have to also make sure + * that the page tables are not freed while we released + * the lock. + */ + cond_resched_lock(&kvm->mmu_lock); + if (!READ_ONCE(kvm->arch.pgd)) + break; + next = stage2_pgd_addr_end(kvm, addr, end); + if (stage2_pgd_present(kvm, *pgd)) + stage2_wp_puds(kvm, pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to write protect + * + * Called to start logging dirty pages after memory region + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns + * all present PUD, PMD and PTEs are write protected in the memory region. + * Afterwards read of dirty page log can be called. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); + phys_addr_t start, end; + + if (WARN_ON_ONCE(!memslot)) + return; + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_wp_range(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); +} + +/** + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. 
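kvm_mmu_write_protect_pt_masked() above turns a dirty-page mask into one contiguous range: __ffs() picks the lowest set bit and __fls() the highest, and any pages caught in between are simply write-protected as well, which appears to cost at most a spurious write fault. A standalone check of that arithmetic, using compiler builtins as stand-ins for the kernel's __ffs()/__fls():

#include <assert.h>
#include <stdint.h>

#define EX_PAGE_SHIFT   12U

/* mask must be non-zero; builtins stand in for __ffs()/__fls(). */
static void ex_wp_range(uint64_t base_gfn, uint64_t mask,
                        uint64_t *start, uint64_t *end)
{
        unsigned int first = __builtin_ctzll(mask);             /* __ffs(mask) */
        unsigned int last = 63 - __builtin_clzll(mask);         /* __fls(mask) */

        *start = (base_gfn + first) << EX_PAGE_SHIFT;
        *end = (base_gfn + last + 1) << EX_PAGE_SHIFT;
}

int main(void)
{
        uint64_t start, end;

        /* Pages 3, 4 and 5 of the 64-page window at gfn 0x100 were dirtied. */
        ex_wp_range(0x100, 0x38, &start, &end);
        assert(start == (0x103UL << EX_PAGE_SHIFT));
        assert(end == (0x106UL << EX_PAGE_SHIFT));
        return 0;
}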
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + +static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) +{ + __clean_dcache_guest_page(pfn, size); +} + +static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) +{ + __invalidate_icache_guest_page(pfn, size); +} + +static void kvm_send_hwpoison_signal(unsigned long address, short lsb) +{ + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); +} + +static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva, + unsigned long map_size) +{ + gpa_t gpa_start; + hva_t uaddr_start, uaddr_end; + size_t size; + + /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ + if (map_size == PAGE_SIZE) + return true; + + size = memslot->npages * PAGE_SIZE; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 + * PMD/PUD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SHIFT: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 block | Stage-2 block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & ~(map_size - 1)) >= uaddr_start && + (hva & ~(map_size - 1)) + map_size <= uaddr_end; +} + +/* + * Check if the given hva is backed by a transparent huge page (THP) and + * whether it can be mapped using block mapping in stage2. If so, adjust + * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently + * supported. This will need to be updated to support other THP sizes. + * + * Returns the size of the mapping. + */ +static unsigned long +transparent_hugepage_adjust(struct kvm_memory_slot *memslot, + unsigned long hva, kvm_pfn_t *pfnp, + phys_addr_t *ipap) +{ + kvm_pfn_t pfn = *pfnp; + + /* + * Make sure the adjustment is done only for THP pages. Also make + * sure that the HVA and IPA are sufficiently aligned and that the + * block map is contained within the memslot. + */ + if (kvm_is_transparent_hugepage(pfn) && + fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + /* + * The address we faulted on is backed by a transparent huge + * page. 
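Taken together, fault_supports_stage2_huge_mapping() above and the pfn/IPA rounding just below implement one rule: a 2MiB block is only used when the IPA and the userspace VA agree on their offset within the block and the whole block sits inside the memslot, and in that case both the fault IPA and the pfn are rounded down to the block and head-page boundary. A hedged standalone sketch, assuming 4K pages and 2MiB blocks (512 PTEs per PMD):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_PAGE_SIZE    4096UL
#define EX_PMD_SIZE     (2UL * 1024 * 1024)
#define EX_PTRS_PER_PMD (EX_PMD_SIZE / EX_PAGE_SIZE)

static bool ex_block_mapping_ok(uint64_t gpa_start, uint64_t uaddr_start,
                                uint64_t uaddr_end, uint64_t hva)
{
        /* IPA and VA must share the same offset within the block ... */
        if ((gpa_start & (EX_PMD_SIZE - 1)) != (uaddr_start & (EX_PMD_SIZE - 1)))
                return false;
        /* ... and the block around hva must lie inside the memslot. */
        return (hva & ~(EX_PMD_SIZE - 1)) >= uaddr_start &&
               (hva & ~(EX_PMD_SIZE - 1)) + EX_PMD_SIZE <= uaddr_end;
}

int main(void)
{
        uint64_t uaddr_start = 0x7f0000000000UL;        /* block aligned */
        uint64_t uaddr_end = uaddr_start + 8 * EX_PMD_SIZE;
        uint64_t gpa_start = 0x80000000UL;              /* same alignment */
        uint64_t hva = uaddr_start + EX_PMD_SIZE + 0x3000;
        uint64_t ipa, pfn;

        assert(ex_block_mapping_ok(gpa_start, uaddr_start, uaddr_end, hva));
        /* Shifting the guest base by one page breaks the congruence. */
        assert(!ex_block_mapping_ok(gpa_start + EX_PAGE_SIZE, uaddr_start,
                                    uaddr_end, hva));

        /* Round the faulting IPA down to the block, the pfn to the head page. */
        ipa = gpa_start + EX_PMD_SIZE + 0x3000;
        pfn = (0x123400000UL >> 12) + 3;        /* three pages into a THP */
        ipa &= ~(EX_PMD_SIZE - 1);
        pfn &= ~(EX_PTRS_PER_PMD - 1);
        assert(ipa == gpa_start + EX_PMD_SIZE);
        assert(pfn == (0x123400000UL >> 12));
        return 0;
}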
However, because we map the compound huge page and + * not the individual tail page, we need to transfer the + * refcount to the head page. We have to be careful that the + * THP doesn't start to split while we are adjusting the + * refcounts. + * + * We are sure this doesn't happen, because mmu_notifier_retry + * was successful and we are holding the mmu_lock, so if this + * THP is trying to split, it will be blocked in the mmu + * notifier before touching any of the pages, specifically + * before being able to call __split_huge_page_refcount(). + * + * We can therefore safely transfer the refcount from PG_tail + * to PG_head and switch the pfn from a tail page to the head + * page accordingly. + */ + *ipap &= PMD_MASK; + kvm_release_pfn_clean(pfn); + pfn &= ~(PTRS_PER_PMD - 1); + kvm_get_pfn(pfn); + *pfnp = pfn; + + return PMD_SIZE; + } + + /* Use page mapping if we cannot use block mapping. */ + return PAGE_SIZE; +} + +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_memory_slot *memslot, unsigned long hva, + unsigned long fault_status) +{ + int ret; + bool write_fault, writable, force_pte = false; + bool exec_fault, needs_exec; + unsigned long mmu_seq; + gfn_t gfn = fault_ipa >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + struct vm_area_struct *vma; + short vma_shift; + kvm_pfn_t pfn; + pgprot_t mem_type = PAGE_S2; + bool logging_active = memslot_is_logging(memslot); + unsigned long vma_pagesize, flags = 0; + + write_fault = kvm_is_write_fault(vcpu); + exec_fault = kvm_vcpu_trap_is_iabt(vcpu); + VM_BUG_ON(write_fault && exec_fault); + + if (fault_status == FSC_PERM && !write_fault && !exec_fault) { + kvm_err("Unexpected L2 read permission error\n"); + return -EFAULT; + } + + /* Let's check if we will get back a huge page backed by hugetlbfs */ + down_read(¤t->mm->mmap_sem); + vma = find_vma_intersection(current->mm, hva, hva + 1); + if (unlikely(!vma)) { + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); + up_read(¤t->mm->mmap_sem); + return -EFAULT; + } + + if (is_vm_hugetlb_page(vma)) + vma_shift = huge_page_shift(hstate_vma(vma)); + else + vma_shift = PAGE_SHIFT; + + vma_pagesize = 1ULL << vma_shift; + if (logging_active || + (vma->vm_flags & VM_PFNMAP) || + !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { + force_pte = true; + vma_pagesize = PAGE_SIZE; + } + + /* + * The stage2 has a minimum of 2 level table (For arm64 see + * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can + * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). + * As for PUD huge maps, we must make sure that we have at least + * 3 levels, i.e, PMD is not folded. + */ + if (vma_pagesize == PMD_SIZE || + (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) + gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; + up_read(¤t->mm->mmap_sem); + + /* We need minimum second+third level pages */ + ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); + if (ret) + return ret; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + /* + * Ensure the read of mmu_notifier_seq happens before we call + * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk + * the page we just got a reference to gets unmapped before we have a + * chance to grab the mmu_lock, which ensure that if the page gets + * unmapped afterwards, the call to kvm_unmap_hva will take it away + * from us again properly. 
This smp_rmb() interacts with the smp_wmb() + * in kvm_mmu_notifier_invalidate_<page|range_end>. + */ + smp_rmb(); + + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(hva, vma_shift); + return 0; + } + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + + if (kvm_is_device_pfn(pfn)) { + mem_type = PAGE_S2_DEVICE; + flags |= KVM_S2PTE_FLAG_IS_IOMAP; + } else if (logging_active) { + /* + * Faults on pages in a memslot with logging enabled + * should not be mapped with huge pages (it introduces churn + * and performance degradation), so force a pte mapping. + */ + flags |= KVM_S2_FLAG_LOGGING_ACTIVE; + + /* + * Only actually map the page as writable if this was a write + * fault. + */ + if (!write_fault) + writable = false; + } + + if (exec_fault && is_iomap(flags)) + return -ENOEXEC; + + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + + /* + * If we are not forced to use page mapping, check if we are + * backed by a THP and thus use block mapping if possible. + */ + if (vma_pagesize == PAGE_SIZE && !force_pte) + vma_pagesize = transparent_hugepage_adjust(memslot, hva, + &pfn, &fault_ipa); + if (writable) + kvm_set_pfn_dirty(pfn); + + if (fault_status != FSC_PERM && !is_iomap(flags)) + clean_dcache_guest_page(pfn, vma_pagesize); + + if (exec_fault) + invalidate_icache_guest_page(pfn, vma_pagesize); + + /* + * If we took an execution fault we have made the + * icache/dcache coherent above and should now let the s2 + * mapping be executable. + * + * Write faults (!exec_fault && FSC_PERM) are orthogonal to + * execute permissions, and we preserve whatever we have. + */ + needs_exec = exec_fault || + (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); + + if (vma_pagesize == PUD_SIZE) { + pud_t new_pud = kvm_pfn_pud(pfn, mem_type); + + new_pud = kvm_pud_mkhuge(new_pud); + if (writable) + new_pud = kvm_s2pud_mkwrite(new_pud); + + if (needs_exec) + new_pud = kvm_s2pud_mkexec(new_pud); + + ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); + } else if (vma_pagesize == PMD_SIZE) { + pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); + + new_pmd = kvm_pmd_mkhuge(new_pmd); + + if (writable) + new_pmd = kvm_s2pmd_mkwrite(new_pmd); + + if (needs_exec) + new_pmd = kvm_s2pmd_mkexec(new_pmd); + + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); + } else { + pte_t new_pte = kvm_pfn_pte(pfn, mem_type); + + if (writable) { + new_pte = kvm_s2pte_mkwrite(new_pte); + mark_page_dirty(kvm, gfn); + } + + if (needs_exec) + new_pte = kvm_s2pte_mkexec(new_pte); + + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); + } + +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_set_pfn_accessed(pfn); + kvm_release_pfn_clean(pfn); + return ret; +} + +/* + * Resolve the access fault by making the page young again. + * Note that because the faulting entry is guaranteed not to be + * cached in the TLB, we don't need to invalidate anything. + * Only the HW Access Flag updates are supported for Stage 2 (no DBM), + * so there is no need for atomic (pte|pmd)_mkyoung operations. 
+ */ +static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + kvm_pfn_t pfn; + bool pfn_valid = false; + + trace_kvm_access_fault(fault_ipa); + + spin_lock(&vcpu->kvm->mmu_lock); + + if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) + goto out; + + if (pud) { /* HugeTLB */ + *pud = kvm_s2pud_mkyoung(*pud); + pfn = kvm_pud_pfn(*pud); + pfn_valid = true; + } else if (pmd) { /* THP, HugeTLB */ + *pmd = pmd_mkyoung(*pmd); + pfn = pmd_pfn(*pmd); + pfn_valid = true; + } else { + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; + } + +out: + spin_unlock(&vcpu->kvm->mmu_lock); + if (pfn_valid) + kvm_set_pfn_accessed(pfn); +} + +/** + * kvm_handle_guest_abort - handles all 2nd stage aborts + * @vcpu: the VCPU pointer + * @run: the kvm_run structure + * + * Any abort that gets to the host is almost guaranteed to be caused by a + * missing second stage translation table entry, which can mean that either the + * guest simply needs more memory and we must allocate an appropriate page or it + * can mean that the guest tried to access I/O memory, which is emulated by user + * space. The distinction is based on the IPA causing the fault and whether this + * memory region has been registered as standard RAM by user space. + */ +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long fault_status; + phys_addr_t fault_ipa; + struct kvm_memory_slot *memslot; + unsigned long hva; + bool is_iabt, write_fault, writable; + gfn_t gfn; + int ret, idx; + + fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + + fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + is_iabt = kvm_vcpu_trap_is_iabt(vcpu); + + /* Synchronous External Abort? */ + if (kvm_vcpu_dabt_isextabt(vcpu)) { + /* + * For RAS the host kernel may handle this abort. + * There is no need to pass the error into the guest. + */ + if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) + return 1; + + if (unlikely(!is_iabt)) { + kvm_inject_vabt(vcpu); + return 1; + } + } + + trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), + kvm_vcpu_get_hfar(vcpu), fault_ipa); + + /* Check the stage-2 fault is trans. fault or write fault */ + if (fault_status != FSC_FAULT && fault_status != FSC_PERM && + fault_status != FSC_ACCESS) { + kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", + kvm_vcpu_trap_get_class(vcpu), + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), + (unsigned long)kvm_vcpu_get_hsr(vcpu)); + return -EFAULT; + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + gfn = fault_ipa >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); + write_fault = kvm_is_write_fault(vcpu); + if (kvm_is_error_hva(hva) || (write_fault && !writable)) { + if (is_iabt) { + /* Prefetch Abort on I/O address */ + ret = -ENOEXEC; + goto out; + } + + /* + * Check for a cache maintenance operation. Since we + * ended-up here, we know it is outside of any memory + * slot. But we can't find out if that is for a device, + * or if the guest is just being stupid. The only thing + * we know for sure is that this range cannot be cached. + * + * So let's assume that the guest is just being + * cautious, and skip the instruction. 
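For the I/O abort path handled just below, the hardware reports the IPA only at page granularity; the byte offset comes from the low 12 bits of the faulting VA. A tiny sketch of that composition (the helper name and addresses are made up for illustration):

#include <assert.h>
#include <stdint.h>

/* The reported IPA covers bits [MAX:12]; the FAR supplies the page offset. */
static uint64_t ex_full_fault_ipa(uint64_t fault_ipa_page, uint64_t far)
{
        return fault_ipa_page | (far & ((1UL << 12) - 1));
}

int main(void)
{
        /* Page-granular IPA 0x3f000000, access at offset 0x04 in that page. */
        assert(ex_full_fault_ipa(0x3f000000UL, 0xffff000010001004UL) ==
               0x3f000004UL);
        return 0;
}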
+ */ + if (kvm_vcpu_dabt_is_cm(vcpu)) { + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + ret = 1; + goto out_unlock; + } + + /* + * The IPA is reported as [MAX:12], so we need to + * complement it with the bottom 12 bits from the + * faulting VA. This is always 12 bits, irrespective + * of the page size. + */ + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); + ret = io_mem_abort(vcpu, run, fault_ipa); + goto out_unlock; + } + + /* Userspace should not be able to register out-of-bounds IPAs */ + VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); + + if (fault_status == FSC_ACCESS) { + handle_access_fault(vcpu, fault_ipa); + ret = 1; + goto out_unlock; + } + + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + if (ret == 0) + ret = 1; +out: + if (ret == -ENOEXEC) { + kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + ret = 1; + } +out_unlock: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} + +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + gpa_t gpa, u64 size, + void *data), + void *data) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = 0; + + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gpa; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; + ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); + } + + return ret; +} + +static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + unmap_stage2_range(kvm, gpa, size); + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + + trace_kvm_unmap_hva_range(start, end); + handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); + return 0; +} + +static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + pte_t *pte = (pte_t *)data; + + WARN_ON(size != PAGE_SIZE); + /* + * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE + * flag clear because MMU notifiers will have unmapped a huge PMD before + * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore stage2_set_pte() never needs to clear out a huge PMD + * through this calling path. + */ + stage2_set_pte(kvm, NULL, gpa, pte, 0); + return 0; +} + + +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + unsigned long end = hva + PAGE_SIZE; + kvm_pfn_t pfn = pte_pfn(pte); + pte_t stage2_pte; + + if (!kvm->arch.pgd) + return 0; + + trace_kvm_set_spte_hva(hva); + + /* + * We've moved a page around, probably through CoW, so let's treat it + * just like a translation fault and clean the cache to the PoC. 
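handle_hva_to_gpa() above is mostly interval arithmetic: clamp the notifier's [start, end) HVA range to each memslot's userspace window, then translate the clamped start into a guest physical address relative to the slot's base gfn. A standalone sketch of that clamping, using a local stand-in struct rather than the kernel's memslot layout:

#include <assert.h>
#include <stdint.h>

#define EX_PAGE_SHIFT   12U

struct ex_memslot {             /* local stand-in, not the kernel layout */
        uint64_t base_gfn;
        uint64_t userspace_addr;
        uint64_t npages;
};

/* Clamp [start, end) to the slot; returns bytes covered, gpa of the start. */
static uint64_t ex_clamp_to_slot(const struct ex_memslot *s, uint64_t start,
                                 uint64_t end, uint64_t *gpa)
{
        uint64_t slot_end = s->userspace_addr + (s->npages << EX_PAGE_SHIFT);
        uint64_t hva_start = start > s->userspace_addr ? start : s->userspace_addr;
        uint64_t hva_end = end < slot_end ? end : slot_end;
        uint64_t gfn;

        if (hva_start >= hva_end)
                return 0;       /* no overlap with this slot */

        gfn = s->base_gfn + ((hva_start - s->userspace_addr) >> EX_PAGE_SHIFT);
        *gpa = gfn << EX_PAGE_SHIFT;
        return hva_end - hva_start;
}

int main(void)
{
        struct ex_memslot slot = {
                .base_gfn = 0x40000,                    /* guest RAM at 1GiB */
                .userspace_addr = 0x7f0000000000UL,
                .npages = 256,                          /* a 1MiB slot */
        };
        uint64_t gpa = 0, len;

        /* Range starts 16KiB into the slot and runs past its end. */
        len = ex_clamp_to_slot(&slot, slot.userspace_addr + 0x4000,
                               slot.userspace_addr + 0x200000, &gpa);
        assert(gpa == (0x40000UL << EX_PAGE_SHIFT) + 0x4000);
        assert(len == (256UL << EX_PAGE_SHIFT) - 0x4000);
        return 0;
}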
+ */ + clean_dcache_guest_page(pfn, PAGE_SIZE); + stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); + handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); + + return 0; +} + +static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) + return 0; + + if (pud) + return stage2_pudp_test_and_clear_young(pud); + else if (pmd) + return stage2_pmdp_test_and_clear_young(pmd); + else + return stage2_ptep_test_and_clear_young(pte); +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) + return 0; + + if (pud) + return kvm_s2pud_young(*pud); + else if (pmd) + return pmd_young(*pmd); + else + return pte_young(*pte); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + trace_kvm_age_hva(start, end); + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.pgd) + return 0; + trace_kvm_test_age_hva(hva); + return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, + kvm_test_age_hva_handler, NULL); +} + +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); +} + +phys_addr_t kvm_mmu_get_httbr(void) +{ + if (__kvm_cpu_uses_extended_idmap()) + return virt_to_phys(merged_hyp_pgd); + else + return virt_to_phys(hyp_pgd); +} + +phys_addr_t kvm_get_idmap_vector(void) +{ + return hyp_idmap_vector; +} + +static int kvm_map_idmap_text(pgd_t *pgd) +{ + int err; + + /* Create the idmap in the boot page tables */ + err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), + hyp_idmap_start, hyp_idmap_end, + __phys_to_pfn(hyp_idmap_start), + PAGE_HYP_EXEC); + if (err) + kvm_err("Failed to idmap %lx-%lx\n", + hyp_idmap_start, hyp_idmap_end); + + return err; +} + +int kvm_mmu_init(void) +{ + int err; + + hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); + hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); + hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); + hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); + hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); + + /* + * We rely on the linker script to ensure at build time that the HYP + * init code does not cross a page boundary. + */ + BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); + + kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); + kvm_debug("HYP VA range: %lx:%lx\n", + kern_hyp_va(PAGE_OFFSET), + kern_hyp_va((unsigned long)high_memory - 1)); + + if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && + hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && + hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { + /* + * The idmap page is intersecting with the VA space, + * it is not safe to continue further. 
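The BUG_ON() in kvm_mmu_init() above relies on a compact property: two addresses are in the same page exactly when their XOR has no bits set above the page offset, which is how the HYP init text is verified to fit in a single page. A standalone check of that property, assuming 4K pages:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_PAGE_MASK    (~0xfffUL)      /* assumed 4K pages */

static bool ex_same_page(uint64_t a, uint64_t b)
{
        return ((a ^ b) & EX_PAGE_MASK) == 0;
}

int main(void)
{
        uint64_t idmap_start = 0x40201000UL;

        /* start and (end - 1) share a page: the init text fits in one page. */
        assert(ex_same_page(idmap_start, idmap_start + 0x800 - 1));
        /* An end one page further on would trip the BUG_ON(). */
        assert(!ex_same_page(idmap_start, idmap_start + 0x1000));
        return 0;
}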
+ */ + kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); + err = -EINVAL; + goto out; + } + + hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); + if (!hyp_pgd) { + kvm_err("Hyp mode PGD not allocated\n"); + err = -ENOMEM; + goto out; + } + + if (__kvm_cpu_uses_extended_idmap()) { + boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + hyp_pgd_order); + if (!boot_hyp_pgd) { + kvm_err("Hyp boot PGD not allocated\n"); + err = -ENOMEM; + goto out; + } + + err = kvm_map_idmap_text(boot_hyp_pgd); + if (err) + goto out; + + merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!merged_hyp_pgd) { + kvm_err("Failed to allocate extra HYP pgd\n"); + goto out; + } + __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, + hyp_idmap_start); + } else { + err = kvm_map_idmap_text(hyp_pgd); + if (err) + goto out; + } + + io_map_base = hyp_idmap_start; + return 0; +out: + free_hyp_pgds(); + return err; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be tracked while the + * memory slot is write protected. + */ + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + /* + * If we're with initial-all-set, we don't need to write + * protect any pages because they're all reported as dirty. + * Huge pages and normal pages will be write protect gradually. + */ + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { + kvm_mmu_wp_memory_region(kvm, mem->slot); + } + } +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) +{ + hva_t hva = mem->userspace_addr; + hva_t reg_end = hva + mem->memory_size; + bool writable = !(mem->flags & KVM_MEM_READONLY); + int ret = 0; + + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) + return 0; + + /* + * Prevent userspace from creating a memory region outside of the IPA + * space addressable by the KVM guest IPA space. + */ + if (memslot->base_gfn + memslot->npages >= + (kvm_phys_size(kvm) >> PAGE_SHIFT)) + return -EFAULT; + + down_read(¤t->mm->mmap_sem); + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them to find out if we can map + * any of them right now. 
+ * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma = find_vma(current->mm, hva); + hva_t vm_start, vm_end; + + if (!vma || vma->vm_start >= reg_end) + break; + + /* + * Take the intersection of this VMA with the memory region + */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (vma->vm_flags & VM_PFNMAP) { + gpa_t gpa = mem->guest_phys_addr + + (vm_start - mem->userspace_addr); + phys_addr_t pa; + + pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; + pa += vm_start - vma->vm_start; + + /* IO region dirty page logging not allowed */ + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { + ret = -EINVAL; + goto out; + } + + ret = kvm_phys_addr_ioremap(kvm, gpa, pa, + vm_end - vm_start, + writable); + if (ret) + break; + } + hva = vm_end; + } while (hva < reg_end); + + if (change == KVM_MR_FLAGS_ONLY) + goto out; + + spin_lock(&kvm->mmu_lock); + if (ret) + unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); + else + stage2_flush_memslot(kvm, memslot); + spin_unlock(&kvm->mmu_lock); +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ +} + +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) +{ +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_free_stage2_pgd(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + gpa_t gpa = slot->base_gfn << PAGE_SHIFT; + phys_addr_t size = slot->npages << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + unmap_stage2_range(kvm, gpa, size); + spin_unlock(&kvm->mmu_lock); +} + +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + * + * Main problems: + * - S/W ops are local to a CPU (not broadcast) + * - We have line migration behind our back (speculation) + * - System caches don't support S/W at all (damn!) + * + * In the face of the above, the best we can do is to try and convert + * S/W ops to VA ops. Because the guest is not allowed to infer the + * S/W to PA mapping, it can only use S/W to nuke the whole cache, + * which is a rather good thing for us. + * + * Also, it is only used when turning caches on/off ("The expected + * usage of the cache maintenance instructions that operate by set/way + * is associated with the cache maintenance instructions associated + * with the powerdown and powerup of caches, if this is required by + * the implementation."). + * + * We use the following policy: + * + * - If we trap a S/W operation, we enable VM trapping to detect + * caches being turned on/off, and do a full clean. + * + * - We flush the caches on both caches being turned on and off. + * + * - Once the caches are enabled, we stop trapping VM ops. + */ +void kvm_set_way_flush(struct kvm_vcpu *vcpu) +{ + unsigned long hcr = *vcpu_hcr(vcpu); + + /* + * If this is the first time we do a S/W operation + * (i.e. HCR_TVM not set) flush the whole memory, and set the + * VM trapping. + * + * Otherwise, rely on the VM trapping to wait for the MMU + + * Caches to be turned off. At that point, we'll be able to + * clean the caches again. 
+ */ + if (!(hcr & HCR_TVM)) { + trace_kvm_set_way_flush(*vcpu_pc(vcpu), + vcpu_has_cache_enabled(vcpu)); + stage2_flush_vm(vcpu->kvm); + *vcpu_hcr(vcpu) = hcr | HCR_TVM; + } +} + +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) +{ + bool now_enabled = vcpu_has_cache_enabled(vcpu); + + /* + * If switching the MMU+caches on, need to invalidate the caches. + * If switching it off, need to clean the caches. + * Clean + invalidate does the trick always. + */ + if (now_enabled != was_enabled) + stage2_flush_vm(vcpu->kvm); + + /* Caches are now on, stop trapping VM ops (until a S/W op) */ + if (now_enabled) + *vcpu_hcr(vcpu) &= ~HCR_TVM; + + trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); +} diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c new file mode 100644 index 000000000000..d45b8b9a4415 --- /dev/null +++ b/arch/arm64/kvm/perf.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Based on the x86 implementation. + * + * Copyright (C) 2012 ARM Ltd. + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/perf_event.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> + +static int kvm_is_in_guest(void) +{ + return kvm_get_running_vcpu() != NULL; +} + +static int kvm_is_user_mode(void) +{ + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_running_vcpu(); + + if (vcpu) + return !vcpu_mode_priv(vcpu); + + return 0; +} + +static unsigned long kvm_get_guest_ip(void) +{ + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_running_vcpu(); + + if (vcpu) + return *vcpu_pc(vcpu); + + return 0; +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .is_in_guest = kvm_is_in_guest, + .is_user_mode = kvm_is_user_mode, + .get_guest_ip = kvm_get_guest_ip, +}; + +int kvm_perf_init(void) +{ + return perf_register_guest_info_callbacks(&kvm_guest_cbs); +} + +int kvm_perf_teardown(void) +{ + return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c new file mode 100644 index 000000000000..f0d0312c0a55 --- /dev/null +++ b/arch/arm64/kvm/pmu-emul.c @@ -0,0 +1,869 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Linaro Ltd. 
+ * Author: Shannon Zhao <shannon.zhao@linaro.org> + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> +#include <linux/uaccess.h> +#include <asm/kvm_emulate.h> +#include <kvm/arm_pmu.h> +#include <kvm/arm_vgic.h> + +static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); + +#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 + +/** + * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) +{ + return (select_idx == ARMV8_PMU_CYCLE_IDX && + __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC); +} + +static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu; + struct kvm_vcpu_arch *vcpu_arch; + + pmc -= pmc->idx; + pmu = container_of(pmc, struct kvm_pmu, pmc[0]); + vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu); + return container_of(vcpu_arch, struct kvm_vcpu, arch); +} + +/** + * kvm_pmu_pmc_is_chained - determine if the pmc is chained + * @pmc: The PMU counter pointer + */ +static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + + return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); +} + +/** + * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter + * @select_idx: The counter index + */ +static bool kvm_pmu_idx_is_high_counter(u64 select_idx) +{ + return select_idx & 0x1; +} + +/** + * kvm_pmu_get_canonical_pmc - obtain the canonical pmc + * @pmc: The PMU counter pointer + * + * When a pair of PMCs are chained together we use the low counter (canonical) + * to hold the underlying perf event. + */ +static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc) +{ + if (kvm_pmu_pmc_is_chained(pmc) && + kvm_pmu_idx_is_high_counter(pmc->idx)) + return pmc - 1; + + return pmc; +} +static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc) +{ + if (kvm_pmu_idx_is_high_counter(pmc->idx)) + return pmc - 1; + else + return pmc + 1; +} + +/** + * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) +{ + u64 eventsel, reg; + + select_idx |= 0x1; + + if (select_idx == ARMV8_PMU_CYCLE_IDX) + return false; + + reg = PMEVTYPER0_EL0 + select_idx; + eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT; + + return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; +} + +/** + * kvm_pmu_get_pair_counter_value - get PMU counter value + * @vcpu: The vcpu pointer + * @pmc: The PMU counter pointer + */ +static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, + struct kvm_pmc *pmc) +{ + u64 counter, counter_high, reg, enabled, running; + + if (kvm_pmu_pmc_is_chained(pmc)) { + pmc = kvm_pmu_get_canonical_pmc(pmc); + reg = PMEVCNTR0_EL0 + pmc->idx; + + counter = __vcpu_sys_reg(vcpu, reg); + counter_high = __vcpu_sys_reg(vcpu, reg + 1); + + counter = lower_32_bits(counter) | (counter_high << 32); + } else { + reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) + ? 
PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; + counter = __vcpu_sys_reg(vcpu, reg); + } + + /* + * The real counter value is equal to the value of counter register plus + * the value perf event counts. + */ + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, &enabled, + &running); + + return counter; +} + +/** + * kvm_pmu_get_counter_value - get PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) +{ + u64 counter; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc = &pmu->pmc[select_idx]; + + counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + + if (kvm_pmu_pmc_is_chained(pmc) && + kvm_pmu_idx_is_high_counter(select_idx)) + counter = upper_32_bits(counter); + else if (select_idx != ARMV8_PMU_CYCLE_IDX) + counter = lower_32_bits(counter); + + return counter; +} + +/** + * kvm_pmu_set_counter_value - set PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + * @val: The counter value + */ +void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +{ + u64 reg; + + reg = (select_idx == ARMV8_PMU_CYCLE_IDX) + ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; + __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); + + /* Recreate the perf event to reflect the updated sample_period */ + kvm_pmu_create_perf_event(vcpu, select_idx); +} + +/** + * kvm_pmu_release_perf_event - remove the perf event + * @pmc: The PMU counter pointer + */ +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) +{ + pmc = kvm_pmu_get_canonical_pmc(pmc); + if (pmc->perf_event) { + perf_event_disable(pmc->perf_event); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +/** + * kvm_pmu_stop_counter - stop PMU counter + * @pmc: The PMU counter pointer + * + * If this counter has been configured to monitor some event, release it here. 
+ */ +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) +{ + u64 counter, reg, val; + + pmc = kvm_pmu_get_canonical_pmc(pmc); + if (!pmc->perf_event) + return; + + counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) { + reg = PMCCNTR_EL0; + val = counter; + } else { + reg = PMEVCNTR0_EL0 + pmc->idx; + val = lower_32_bits(counter); + } + + __vcpu_sys_reg(vcpu, reg) = val; + + if (kvm_pmu_pmc_is_chained(pmc)) + __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter); + + kvm_pmu_release_perf_event(pmc); +} + +/** + * kvm_pmu_vcpu_init - assign pmu counter idx for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + pmu->pmc[i].idx = i; +} + +/** + * kvm_pmu_vcpu_reset - reset pmu state for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) +{ + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; + + for_each_set_bit(i, &mask, 32) + kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); + + bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); +} + +/** + * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + kvm_pmu_release_perf_event(&pmu->pmc[i]); +} + +u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; + + val &= ARMV8_PMU_PMCR_N_MASK; + if (val == 0) + return BIT(ARMV8_PMU_CYCLE_IDX); + else + return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); +} + +/** + * kvm_pmu_enable_counter_mask - enable selected PMU counters + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMCNTENSET register + * + * Call perf_event_enable to start counting the perf event + */ +void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) + return; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + if (!(val & BIT(i))) + continue; + + pmc = &pmu->pmc[i]; + + /* A change in the enable state may affect the chain state */ + kvm_pmu_update_pmc_chained(vcpu, i); + kvm_pmu_create_perf_event(vcpu, i); + + /* At this point, pmc must be the canonical */ + if (pmc->perf_event) { + perf_event_enable(pmc->perf_event); + if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) + kvm_debug("fail to enable perf event\n"); + } + } +} + +/** + * kvm_pmu_disable_counter_mask - disable selected PMU counters + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMCNTENCLR register + * + * Call perf_event_disable to stop counting the perf event + */ +void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + if (!val) + return; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + if (!(val & BIT(i))) + continue; + + pmc = &pmu->pmc[i]; + + /* A change in the enable state may affect the chain state */ + kvm_pmu_update_pmc_chained(vcpu, i); + kvm_pmu_create_perf_event(vcpu, i); + + /* At this point, pmc must be the canonical */ + if (pmc->perf_event) + perf_event_disable(pmc->perf_event); + } +} + +static u64 
kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) +{ + u64 reg = 0; + + if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { + reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); + reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); + reg &= kvm_pmu_valid_counter_mask(vcpu); + } + + return reg; +} + +static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool overflow; + + if (!kvm_arm_pmu_v3_ready(vcpu)) + return; + + overflow = !!kvm_pmu_overflow_status(vcpu); + if (pmu->irq_level == overflow) + return; + + pmu->irq_level = overflow; + + if (likely(irqchip_in_kernel(vcpu->kvm))) { + int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + pmu->irq_num, overflow, pmu); + WARN_ON(ret); + } +} + +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + return pmu->irq_level != run_level; +} + +/* + * Reflect the PMU overflow interrupt output level into the kvm_run structure + */ +void kvm_pmu_update_run(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the timer bitmap for user space */ + regs->device_irq_level &= ~KVM_ARM_DEV_PMU; + if (vcpu->arch.pmu.irq_level) + regs->device_irq_level |= KVM_ARM_DEV_PMU; +} + +/** + * kvm_pmu_flush_hwstate - flush pmu state to cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the host, and inject + * an interrupt if that was the case. + */ +void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +/** + * kvm_pmu_sync_hwstate - sync pmu state from cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the guest, and + * inject an interrupt if that was the case. + */ +void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +/** + * When the perf event overflows, set the overflow status and inform the vcpu. + */ +static void kvm_pmu_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + int idx = pmc->idx; + u64 period; + + cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE); + + /* + * Reset the sample period to the architectural limit, + * i.e. the point where the counter overflows. 
+ */ + period = -(local64_read(&perf_event->count)); + + if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) + period &= GENMASK(31, 0); + + local64_set(&perf_event->hw.period_left, 0); + perf_event->attr.sample_period = period; + perf_event->hw.sample_period = period; + + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + + if (kvm_pmu_overflow_status(vcpu)) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } + + cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); +} + +/** + * kvm_pmu_software_increment - do software increment + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMSWINC register + */ +void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; + + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + return; + + /* Weed out disabled counters */ + val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + + for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { + u64 type, reg; + + if (!(val & BIT(i))) + continue; + + /* PMSWINC only applies to ... SW_INC! */ + type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); + type &= ARMV8_PMU_EVTYPE_EVENT; + if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) + continue; + + /* increment this even SW_INC counter */ + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; + reg = lower_32_bits(reg); + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; + + if (reg) /* no overflow on the low part */ + continue; + + if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) { + /* increment the high counter */ + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1; + reg = lower_32_bits(reg); + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg; + if (!reg) /* mark overflow on the high counter */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1); + } else { + /* mark overflow on low counter */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + } + } +} + +/** + * kvm_pmu_handle_pmcr - handle PMCR register + * @vcpu: The vcpu pointer + * @val: the value guest writes to PMCR register + */ +void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) +{ + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + int i; + + if (val & ARMV8_PMU_PMCR_E) { + kvm_pmu_enable_counter_mask(vcpu, + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); + } else { + kvm_pmu_disable_counter_mask(vcpu, mask); + } + + if (val & ARMV8_PMU_PMCR_C) + kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); + + if (val & ARMV8_PMU_PMCR_P) { + for_each_set_bit(i, &mask, 32) + kvm_pmu_set_counter_value(vcpu, i, 0); + } +} + +static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) +{ + return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && + (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); +} + +/** + * kvm_pmu_create_perf_event - create a perf event for a counter + * @vcpu: The vcpu pointer + * @select_idx: The number of selected counter + */ +static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + struct perf_event *event; + struct perf_event_attr attr; + u64 eventsel, counter, reg, data; + + /* + * For chained counters the event type and filtering attributes are + * obtained from the low/even counter. We also use this counter to + * determine if the event is enabled/disabled. + */ + pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]); + + reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) + ? 
PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx; + data = __vcpu_sys_reg(vcpu, reg); + + kvm_pmu_stop_counter(vcpu, pmc); + eventsel = data & ARMV8_PMU_EVTYPE_EVENT; + + /* Software increment event doesn't need to be backed by a perf event */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && + pmc->idx != ARMV8_PMU_CYCLE_IDX) + return; + + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.type = PERF_TYPE_RAW; + attr.size = sizeof(attr); + attr.pinned = 1; + attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx); + attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; + attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; + attr.exclude_hv = 1; /* Don't count EL2 events */ + attr.exclude_host = 1; /* Don't count host events */ + attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? + ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; + + counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + + if (kvm_pmu_pmc_is_chained(pmc)) { + /** + * The initial sample period (overflow count) of an event. For + * chained counters we only support overflow interrupts on the + * high counter. + */ + attr.sample_period = (-counter) & GENMASK(63, 0); + attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, + pmc + 1); + } else { + /* The initial sample period (overflow count) of an event. */ + if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) + attr.sample_period = (-counter) & GENMASK(63, 0); + else + attr.sample_period = (-counter) & GENMASK(31, 0); + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, pmc); + } + + if (IS_ERR(event)) { + pr_err_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return; + } + + pmc->perf_event = event; +} + +/** + * kvm_pmu_update_pmc_chained - update chained bitmap + * @vcpu: The vcpu pointer + * @select_idx: The number of selected counter + * + * Update the chained bitmap based on the event type written in the + * typer register and the enable state of the odd register. + */ +static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc; + bool new_state, old_state; + + old_state = kvm_pmu_pmc_is_chained(pmc); + new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) && + kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1); + + if (old_state == new_state) + return; + + canonical_pmc = kvm_pmu_get_canonical_pmc(pmc); + kvm_pmu_stop_counter(vcpu, canonical_pmc); + if (new_state) { + /* + * During promotion from !chained to chained we must ensure + * the adjacent counter is stopped and its event destroyed + */ + kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc)); + set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); + return; + } + clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); +} + +/** + * kvm_pmu_set_counter_event_type - set selected counter to monitor some event + * @vcpu: The vcpu pointer + * @data: The data guest writes to PMXEVTYPER_EL0 + * @select_idx: The number of selected counter + * + * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an + * event with given hardware event number. Here we call perf_event API to + * emulate this action and create a kernel perf event for it. + */ +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, + u64 select_idx) +{ + u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK; + + reg = (select_idx == ARMV8_PMU_CYCLE_IDX) + ? 
PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; + + __vcpu_sys_reg(vcpu, reg) = event_type; + + kvm_pmu_update_pmc_chained(vcpu, select_idx); + kvm_pmu_create_perf_event(vcpu, select_idx); +} + +bool kvm_arm_support_pmu_v3(void) +{ + /* + * Check if HW_PERF_EVENTS are supported by checking the number of + * hardware performance counters. This could ensure the presence of + * a physical PMU and CONFIG_PERF_EVENT is selected. + */ + return (perf_num_counters() > 0); +} + +int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.pmu.created) + return 0; + + /* + * A valid interrupt configuration for the PMU is either to have a + * properly configured interrupt number and using an in-kernel + * irqchip, or to not have an in-kernel GIC and not set an IRQ. + */ + if (irqchip_in_kernel(vcpu->kvm)) { + int irq = vcpu->arch.pmu.irq_num; + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -EINVAL; + + /* + * If we are using an in-kernel vgic, at this point we know + * the vgic will be initialized, so we can check the PMU irq + * number against the dimensions of the vgic and make sure + * it's valid. + */ + if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) + return -EINVAL; + } else if (kvm_arm_pmu_irq_initialized(vcpu)) { + return -EINVAL; + } + + kvm_pmu_vcpu_reset(vcpu); + vcpu->arch.pmu.ready = true; + + return 0; +} + +static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) +{ + if (!kvm_arm_support_pmu_v3()) + return -ENODEV; + + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENXIO; + + if (vcpu->arch.pmu.created) + return -EBUSY; + + if (irqchip_in_kernel(vcpu->kvm)) { + int ret; + + /* + * If using the PMU with an in-kernel virtual GIC + * implementation, we require the GIC to be already + * initialized when initializing the PMU. + */ + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, + &vcpu->arch.pmu); + if (ret) + return ret; + } + + vcpu->arch.pmu.created = true; + return 0; +} + +/* + * For one VM the interrupt type must be same for each vcpu. + * As a PPI, the interrupt number is the same for all vcpus, + * while as an SPI it must be a separate number per vcpu. + */ +static bool pmu_irq_is_valid(struct kvm *kvm, int irq) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_arm_pmu_irq_initialized(vcpu)) + continue; + + if (irq_is_ppi(irq)) { + if (vcpu->arch.pmu.irq_num != irq) + return false; + } else { + if (vcpu->arch.pmu.irq_num == irq) + return false; + } + } + + return true; +} + +int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENODEV; + + if (get_user(irq, uaddr)) + return -EFAULT; + + /* The PMU overflow interrupt can be a PPI or a valid SPI. 
*/ + if (!(irq_is_ppi(irq) || irq_is_spi(irq))) + return -EINVAL; + + if (!pmu_irq_is_valid(vcpu->kvm, irq)) + return -EINVAL; + + if (kvm_arm_pmu_irq_initialized(vcpu)) + return -EBUSY; + + kvm_debug("Set kvm ARM PMU irq: %d\n", irq); + vcpu->arch.pmu.irq_num = irq; + return 0; + } + case KVM_ARM_VCPU_PMU_V3_INIT: + return kvm_arm_pmu_v3_init(vcpu); + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + irq = vcpu->arch.pmu.irq_num; + return put_user(irq, uaddr); + } + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: + case KVM_ARM_VCPU_PMU_V3_INIT: + if (kvm_arm_support_pmu_v3() && + test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return 0; + } + + return -ENXIO; +} diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c new file mode 100644 index 000000000000..83415e96b589 --- /dev/null +++ b/arch/arm64/kvm/psci.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/arm-smccc.h> +#include <linux/preempt.h> +#include <linux/kvm_host.h> +#include <linux/uaccess.h> +#include <linux/wait.h> + +#include <asm/cputype.h> +#include <asm/kvm_emulate.h> + +#include <kvm/arm_psci.h> +#include <kvm/arm_hypercalls.h> + +/* + * This is an implementation of the Power State Coordination Interface + * as described in ARM document number ARM DEN 0022A. + */ + +#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) + +static unsigned long psci_affinity_mask(unsigned long affinity_level) +{ + if (affinity_level <= 3) + return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); + + return 0; +} + +static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + /* + * NOTE: For simplicity, we make VCPU suspend emulation to be + * same-as WFI (Wait-for-interrupt) emulation. + * + * This means for KVM the wakeup events are interrupts and + * this is consistent with intended use of StateID as described + * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A). + * + * Further, we also treat power-down request to be same as + * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2 + * specification (ARM DEN 0022A). This means all suspend states + * for KVM will preserve the register state. + */ + kvm_vcpu_block(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); + + return PSCI_RET_SUCCESS; +} + +static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) +{ + vcpu->arch.power_off = true; + kvm_make_request(KVM_REQ_SLEEP, vcpu); + kvm_vcpu_kick(vcpu); +} + +static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) +{ + struct vcpu_reset_state *reset_state; + struct kvm *kvm = source_vcpu->kvm; + struct kvm_vcpu *vcpu = NULL; + unsigned long cpu_id; + + cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; + if (vcpu_mode_is_32bit(source_vcpu)) + cpu_id &= ~((u32) 0); + + vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); + + /* + * Make sure the caller requested a valid CPU and that the CPU is + * turned off. 
+ */ + if (!vcpu) + return PSCI_RET_INVALID_PARAMS; + if (!vcpu->arch.power_off) { + if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1) + return PSCI_RET_ALREADY_ON; + else + return PSCI_RET_INVALID_PARAMS; + } + + reset_state = &vcpu->arch.reset_state; + + reset_state->pc = smccc_get_arg2(source_vcpu); + + /* Propagate caller endianness */ + reset_state->be = kvm_vcpu_is_be(source_vcpu); + + /* + * NOTE: We always update r0 (or x0) because for PSCI v0.1 + * the general purpose registers are undefined upon CPU_ON. + */ + reset_state->r0 = smccc_get_arg3(source_vcpu); + + WRITE_ONCE(reset_state->reset, true); + kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); + + /* + * Make sure the reset request is observed if the change to + * power_state is observed. + */ + smp_wmb(); + + vcpu->arch.power_off = false; + kvm_vcpu_wake_up(vcpu); + + return PSCI_RET_SUCCESS; +} + +static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) +{ + int i, matching_cpus = 0; + unsigned long mpidr; + unsigned long target_affinity; + unsigned long target_affinity_mask; + unsigned long lowest_affinity_level; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + + target_affinity = smccc_get_arg1(vcpu); + lowest_affinity_level = smccc_get_arg2(vcpu); + + /* Determine target affinity mask */ + target_affinity_mask = psci_affinity_mask(lowest_affinity_level); + if (!target_affinity_mask) + return PSCI_RET_INVALID_PARAMS; + + /* Ignore other bits of target affinity */ + target_affinity &= target_affinity_mask; + + /* + * If one or more VCPU matching target affinity are running + * then ON else OFF + */ + kvm_for_each_vcpu(i, tmp, kvm) { + mpidr = kvm_vcpu_get_mpidr_aff(tmp); + if ((mpidr & target_affinity_mask) == target_affinity) { + matching_cpus++; + if (!tmp->arch.power_off) + return PSCI_0_2_AFFINITY_LEVEL_ON; + } + } + + if (!matching_cpus) + return PSCI_RET_INVALID_PARAMS; + + return PSCI_0_2_AFFINITY_LEVEL_OFF; +} + +static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) +{ + int i; + struct kvm_vcpu *tmp; + + /* + * The KVM ABI specifies that a system event exit may call KVM_RUN + * again and may perform shutdown/reboot at a later time that when the + * actual request is made. Since we are implementing PSCI and a + * caller of PSCI reboot and shutdown expects that the system shuts + * down or reboots immediately, let's make sure that VCPUs are not run + * after this call is handled and before the VCPUs have been + * re-initialized. + */ + kvm_for_each_vcpu(i, tmp, vcpu->kvm) + tmp->arch.power_off = true; + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); + + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); + vcpu->run->system_event.type = type; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; +} + +static void kvm_psci_system_off(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN); +} + +static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); +} + +static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) +{ + int i; + + /* + * Zero the input registers' upper 32 bits. They will be fully + * zeroed on exit, so we're fine changing them in place. 
+ */ + for (i = 1; i < 4; i++) + vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); +} + +static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) +{ + switch(fn) { + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN64_AFFINITY_INFO: + /* Disallow these functions for 32bit guests */ + if (vcpu_mode_is_32bit(vcpu)) + return PSCI_RET_NOT_SUPPORTED; + break; + } + + return 0; +} + +static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + int ret = 1; + + val = kvm_psci_check_allowed_function(vcpu, psci_fn); + if (val) + goto out; + + switch (psci_fn) { + case PSCI_0_2_FN_PSCI_VERSION: + /* + * Bits[31:16] = Major Version = 0 + * Bits[15:0] = Minor Version = 2 + */ + val = KVM_ARM_PSCI_0_2; + break; + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + val = kvm_psci_vcpu_suspend(vcpu); + break; + case PSCI_0_2_FN_CPU_OFF: + kvm_psci_vcpu_off(vcpu); + val = PSCI_RET_SUCCESS; + break; + case PSCI_0_2_FN_CPU_ON: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_CPU_ON: + mutex_lock(&kvm->lock); + val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); + break; + case PSCI_0_2_FN_AFFINITY_INFO: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_AFFINITY_INFO: + val = kvm_psci_vcpu_affinity_info(vcpu); + break; + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + /* + * Trusted OS is MP hence does not require migration + * or + * Trusted OS is not present + */ + val = PSCI_0_2_TOS_MP; + break; + case PSCI_0_2_FN_SYSTEM_OFF: + kvm_psci_system_off(vcpu); + /* + * We shouldn't be going back to guest VCPU after + * receiving SYSTEM_OFF request. + * + * If user space accidentally/deliberately resumes + * guest VCPU after SYSTEM_OFF request then guest + * VCPU should see internal failure from PSCI return + * value. To achieve this, we preload r0 (or x0) with + * PSCI return value INTERNAL_FAILURE. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; + case PSCI_0_2_FN_SYSTEM_RESET: + kvm_psci_system_reset(vcpu); + /* + * Same reason as SYSTEM_OFF for preloading r0 (or x0) + * with PSCI return value INTERNAL_FAILURE. 
+ */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + +out: + smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; +} + +static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) +{ + u32 psci_fn = smccc_get_function(vcpu); + u32 feature; + unsigned long val; + int ret = 1; + + switch(psci_fn) { + case PSCI_0_2_FN_PSCI_VERSION: + val = KVM_ARM_PSCI_1_0; + break; + case PSCI_1_0_FN_PSCI_FEATURES: + feature = smccc_get_arg1(vcpu); + val = kvm_psci_check_allowed_function(vcpu, feature); + if (val) + break; + + switch(feature) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN_AFFINITY_INFO: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_1_0_FN_PSCI_FEATURES: + case ARM_SMCCC_VERSION_FUNC_ID: + val = 0; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + break; + default: + return kvm_psci_0_2_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; +} + +static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + + switch (psci_fn) { + case KVM_PSCI_FN_CPU_OFF: + kvm_psci_vcpu_off(vcpu); + val = PSCI_RET_SUCCESS; + break; + case KVM_PSCI_FN_CPU_ON: + mutex_lock(&kvm->lock); + val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} + +/** + * kvm_psci_call - handle PSCI call if r0 value is in range + * @vcpu: Pointer to the VCPU struct + * + * Handle PSCI calls from guests through traps from HVC instructions. + * The calling convention is similar to SMC calls to the secure world + * where the function number is placed in r0. + * + * This function returns: > 0 (success), 0 (success but exit to user + * space), and < 0 (errors) + * + * Errors: + * -EINVAL: Unrecognized PSCI function + */ +int kvm_psci_call(struct kvm_vcpu *vcpu) +{ + switch (kvm_psci_version(vcpu, vcpu->kvm)) { + case KVM_ARM_PSCI_1_0: + return kvm_psci_1_0_call(vcpu); + case KVM_ARM_PSCI_0_2: + return kvm_psci_0_2_call(vcpu); + case KVM_ARM_PSCI_0_1: + return kvm_psci_0_1_call(vcpu); + default: + return -EINVAL; + }; +} + +int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) +{ + return 3; /* PSCI version and two workaround registers */ +} + +int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++)) + return -EFAULT; + + if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++)) + return -EFAULT; + + if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++)) + return -EFAULT; + + return 0; +} + +#define KVM_REG_FEATURE_LEVEL_WIDTH 4 +#define KVM_REG_FEATURE_LEVEL_MASK (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1) + +/* + * Convert the workaround level into an easy-to-compare number, where higher + * values mean better protection. 
+ */ +static int get_kernel_wa_level(u64 regid) +{ + switch (regid) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + switch (kvm_arm_harden_branch_predictor()) { + case KVM_BP_HARDEN_UNKNOWN: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case KVM_BP_HARDEN_WA_NEEDED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; + case KVM_BP_HARDEN_NOT_REQUIRED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + switch (kvm_arm_have_ssbd()) { + case KVM_SSBD_FORCE_DISABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + case KVM_SSBD_KERNEL: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL; + case KVM_SSBD_FORCE_ENABLE: + case KVM_SSBD_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + case KVM_SSBD_UNKNOWN: + default: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN; + } + } + + return -EINVAL; +} + +int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + val = kvm_psci_version(vcpu, vcpu->kvm); + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + + if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && + kvm_arm_get_vcpu_workaround_2_flag(vcpu)) + val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED; + break; + default: + return -ENOENT; + } + + if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + int wa_level; + + if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + { + bool wants_02; + + wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); + + switch (val) { + case KVM_ARM_PSCI_0_1: + if (wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + case KVM_ARM_PSCI_0_2: + case KVM_ARM_PSCI_1_0: + if (!wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + } + break; + } + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + if (val & ~KVM_REG_FEATURE_LEVEL_MASK) + return -EINVAL; + + if (get_kernel_wa_level(reg->id) < val) + return -EINVAL; + + return 0; + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) + return -EINVAL; + + wa_level = val & KVM_REG_FEATURE_LEVEL_MASK; + + if (get_kernel_wa_level(reg->id) < wa_level) + return -EINVAL; + + /* The enabled bit must not be set unless the level is AVAIL. */ + if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && + wa_level != val) + return -EINVAL; + + /* Are we finished or do we need to check the enable bit ? */ + if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL) + return 0; + + /* + * If this kernel supports the workaround to be switched on + * or off, make sure it matches the requested setting. 
+ */ + switch (wa_level) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + kvm_arm_set_vcpu_workaround_2_flag(vcpu, + val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED); + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + kvm_arm_set_vcpu_workaround_2_flag(vcpu, true); + break; + } + + return 0; + default: + return -ENOENT; + } + + return -EINVAL; +} diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c new file mode 100644 index 000000000000..1e0f4c284888 --- /dev/null +++ b/arch/arm64/kvm/pvtime.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_mmu.h> +#include <asm/pvclock-abi.h> + +#include <kvm/arm_hypercalls.h> + +void kvm_update_stolen_time(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 steal; + __le64 steal_le; + u64 offset; + int idx; + u64 base = vcpu->arch.steal.base; + + if (base == GPA_INVALID) + return; + + /* Let's do the local bookkeeping */ + steal = vcpu->arch.steal.steal; + steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal; + vcpu->arch.steal.last_steal = current->sched_info.run_delay; + vcpu->arch.steal.steal = steal; + + steal_le = cpu_to_le64(steal); + idx = srcu_read_lock(&kvm->srcu); + offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); + kvm_put_guest(kvm, base + offset, steal_le, u64); + srcu_read_unlock(&kvm->srcu, idx); +} + +long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) +{ + u32 feature = smccc_get_arg1(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + + switch (feature) { + case ARM_SMCCC_HV_PV_TIME_FEATURES: + case ARM_SMCCC_HV_PV_TIME_ST: + val = SMCCC_RET_SUCCESS; + break; + } + + return val; +} + +gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) +{ + struct pvclock_vcpu_stolen_time init_values = {}; + struct kvm *kvm = vcpu->kvm; + u64 base = vcpu->arch.steal.base; + int idx; + + if (base == GPA_INVALID) + return base; + + /* + * Start counting stolen time from the time the guest requests + * the feature enabled. 
+ */ + vcpu->arch.steal.steal = 0; + vcpu->arch.steal.last_steal = current->sched_info.run_delay; + + idx = srcu_read_lock(&kvm->srcu); + kvm_write_guest(kvm, base, &init_values, sizeof(init_values)); + srcu_read_unlock(&kvm->srcu, idx); + + return base; +} + +int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + struct kvm *kvm = vcpu->kvm; + u64 ipa; + int ret = 0; + int idx; + + if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + if (get_user(ipa, user)) + return -EFAULT; + if (!IS_ALIGNED(ipa, 64)) + return -EINVAL; + if (vcpu->arch.steal.base != GPA_INVALID) + return -EEXIST; + + /* Check the address is in a valid memslot */ + idx = srcu_read_lock(&kvm->srcu); + if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT))) + ret = -EINVAL; + srcu_read_unlock(&kvm->srcu, idx); + + if (!ret) + vcpu->arch.steal.base = ipa; + + return ret; +} + +int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + u64 ipa; + + if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + ipa = vcpu->arch.steal.base; + + if (put_user(ipa, user)) + return -EFAULT; + return 0; +} + +int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PVTIME_IPA: + return 0; + } + return -ENXIO; +} diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 30b7ea680f66..d3b209023727 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -36,23 +36,11 @@ static u32 kvm_ipa_limit; /* * ARMv8 Reset Values */ -static const struct kvm_regs default_regs_reset = { - .regs.pstate = (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | - PSR_F_BIT | PSR_D_BIT), -}; +#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) -static const struct kvm_regs default_regs_reset32 = { - .regs.pstate = (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | - PSR_AA32_I_BIT | PSR_AA32_F_BIT), -}; - -static bool cpu_has_32bit_el1(void) -{ - u64 pfr0; - - pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - return !!(pfr0 & 0x20); -} +#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ + PSR_AA32_I_BIT | PSR_AA32_F_BIT) /** * kvm_arch_vm_ioctl_check_extension @@ -66,7 +54,7 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ARM_EL1_32BIT: - r = cpu_has_32bit_el1(); + r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); break; case KVM_CAP_GUEST_DEBUG_HW_BPS: r = get_num_brps(); @@ -163,7 +151,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) vl = vcpu->arch.sve_max_vl; /* - * Resposibility for these properties is shared between + * Responsibility for these properties is shared between * kvm_arm_init_arch_resources(), kvm_vcpu_enable_sve() and * set_sve_vls(). Double-check here just to be sure: */ @@ -249,7 +237,7 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) * ioctl or as part of handling a request issued by another VCPU in the PSCI * handling code. In the first case, the VCPU will not be loaded, and in the * second case the VCPU will be loaded. Because this function operates purely - * on the memory-backed valus of system registers, we want to do a full put if + * on the memory-backed values of system registers, we want to do a full put if * we were loaded (handling a request) and load the values back at the end of * the function. Otherwise we leave the state alone. 
In both cases, we * disable preemption around the vcpu reset as we would otherwise race with @@ -257,9 +245,9 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { - const struct kvm_regs *cpu_reset; int ret = -EINVAL; bool loaded; + u32 pstate; /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -288,18 +276,19 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) switch (vcpu->arch.target) { default: if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { - if (!cpu_has_32bit_el1()) + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) goto out; - cpu_reset = &default_regs_reset32; + pstate = VCPU_RESET_PSTATE_SVC; } else { - cpu_reset = &default_regs_reset; + pstate = VCPU_RESET_PSTATE_EL1; } break; } /* Reset core registers */ - memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset)); + memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); + vcpu_gp_regs(vcpu)->regs.pstate = pstate; /* Reset system registers */ kvm_reset_sys_regs(vcpu); @@ -340,11 +329,50 @@ out: return ret; } -void kvm_set_ipa_limit(void) +u32 get_kvm_ipa_limit(void) +{ + return kvm_ipa_limit; +} + +int kvm_set_ipa_limit(void) { - unsigned int ipa_max, pa_max, va_max, parange; + unsigned int ipa_max, pa_max, va_max, parange, tgran_2; + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_PARANGE_SHIFT); + + /* + * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at + * Stage-2. If not, things will stop very quickly. + */ + switch (PAGE_SIZE) { + default: + case SZ_4K: + tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT; + break; + case SZ_16K: + tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT; + break; + case SZ_64K: + tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT; + break; + } + + switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { + default: + case 1: + kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); + return -EINVAL; + case 0: + kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); + break; + case 2: + kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); + break; + } - parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); /* Clamp the IPA limit to the PA size supported by the kernel */ @@ -357,7 +385,7 @@ void kvm_set_ipa_limit(void) * * So clamp the ipa limit further down to limit the number of levels. * Since we can concatenate upto 16 tables at entry level, we could - * go upto 4bits above the maximum VA addressible with the current + * go upto 4bits above the maximum VA addressable with the current * number of levels. 
*/ va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; @@ -378,6 +406,8 @@ void kvm_set_ipa_limit(void) "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); kvm_ipa_limit = ipa_max; kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); + + return 0; } /* @@ -390,7 +420,7 @@ void kvm_set_ipa_limit(void) */ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) { - u64 vtcr = VTCR_EL2_FLAGS; + u64 vtcr = VTCR_EL2_FLAGS, mmfr0; u32 parange, phys_shift; u8 lvls; @@ -406,7 +436,9 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) phys_shift = KVM_PHYS_SHIFT; } - parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_PARANGE_SHIFT); if (parange > ID_AA64MMFR0_PARANGE_MAX) parange = ID_AA64MMFR0_PARANGE_MAX; vtcr |= parange << VTCR_EL2_PS_SHIFT; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 51db934702b6..80985439bfb2 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -34,7 +34,7 @@ #include "trace.h" /* - * All of this file is extremly similar to the ARM coproc.c, but the + * All of this file is extremely similar to the ARM coproc.c, but the * types are different. My gut feeling is that it should be pretty * easy to merge, but that would be an ABI breakage -- again. VFP * would also need to be abstracted. @@ -64,11 +64,8 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +static bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) { - if (!vcpu->arch.sysregs_loaded_on_cpu) - goto immediate_read; - /* * System registers listed in the switch are not saved on every * exit from the guest but are only saved on vcpu_put. @@ -79,75 +76,92 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) * thread when emulating cross-VCPU communication. 
*/ switch (reg) { - case CSSELR_EL1: return read_sysreg_s(SYS_CSSELR_EL1); - case SCTLR_EL1: return read_sysreg_s(SYS_SCTLR_EL12); - case ACTLR_EL1: return read_sysreg_s(SYS_ACTLR_EL1); - case CPACR_EL1: return read_sysreg_s(SYS_CPACR_EL12); - case TTBR0_EL1: return read_sysreg_s(SYS_TTBR0_EL12); - case TTBR1_EL1: return read_sysreg_s(SYS_TTBR1_EL12); - case TCR_EL1: return read_sysreg_s(SYS_TCR_EL12); - case ESR_EL1: return read_sysreg_s(SYS_ESR_EL12); - case AFSR0_EL1: return read_sysreg_s(SYS_AFSR0_EL12); - case AFSR1_EL1: return read_sysreg_s(SYS_AFSR1_EL12); - case FAR_EL1: return read_sysreg_s(SYS_FAR_EL12); - case MAIR_EL1: return read_sysreg_s(SYS_MAIR_EL12); - case VBAR_EL1: return read_sysreg_s(SYS_VBAR_EL12); - case CONTEXTIDR_EL1: return read_sysreg_s(SYS_CONTEXTIDR_EL12); - case TPIDR_EL0: return read_sysreg_s(SYS_TPIDR_EL0); - case TPIDRRO_EL0: return read_sysreg_s(SYS_TPIDRRO_EL0); - case TPIDR_EL1: return read_sysreg_s(SYS_TPIDR_EL1); - case AMAIR_EL1: return read_sysreg_s(SYS_AMAIR_EL12); - case CNTKCTL_EL1: return read_sysreg_s(SYS_CNTKCTL_EL12); - case PAR_EL1: return read_sysreg_s(SYS_PAR_EL1); - case DACR32_EL2: return read_sysreg_s(SYS_DACR32_EL2); - case IFSR32_EL2: return read_sysreg_s(SYS_IFSR32_EL2); - case DBGVCR32_EL2: return read_sysreg_s(SYS_DBGVCR32_EL2); + case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break; + case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break; + case ACTLR_EL1: *val = read_sysreg_s(SYS_ACTLR_EL1); break; + case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break; + case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; + case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; + case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; + case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; + case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; + case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; + case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break; + case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break; + case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; + case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break; + case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; + case PAR_EL1: *val = read_sysreg_s(SYS_PAR_EL1); break; + case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; + case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; + default: return false; } -immediate_read: - return __vcpu_sys_reg(vcpu, reg); + return true; } -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +static bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) { - if (!vcpu->arch.sysregs_loaded_on_cpu) - goto immediate_write; - /* * System registers listed in the switch are not restored on every * entry to the guest but are only restored on vcpu_load. * * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but - * should never be listed below, because the the MPIDR should only be - * set once, before running the VCPU, and never changed later. + * should never be listed below, because the MPIDR should only be set + * once, before running the VCPU, and never changed later. 
*/ switch (reg) { - case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); return; - case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); return; - case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); return; - case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); return; - case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); return; - case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); return; - case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); return; - case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); return; - case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); return; - case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); return; - case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); return; - case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); return; - case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); return; - case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12); return; - case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); return; - case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); return; - case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); return; - case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); return; - case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); return; - case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); return; - case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); return; - case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); return; - case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); return; + case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break; + case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; + case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); break; + case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; + case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; + case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; + case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; + case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; + case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; + case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; + case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; + case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; + case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; + case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; + case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; + case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; + case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; + default: return false; } -immediate_write: + return true; +} + +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +{ + u64 val = 0x8badf00d8badf00d; + + if (vcpu->arch.sysregs_loaded_on_cpu && + __vcpu_read_sys_reg_from_cpu(reg, &val)) + return val; + + return __vcpu_sys_reg(vcpu, reg); +} + +void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +{ + if (vcpu->arch.sysregs_loaded_on_cpu && + __vcpu_write_sys_reg_to_cpu(val, reg)) + return; + __vcpu_sys_reg(vcpu, reg) = val; } @@ -1456,9 +1470,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_SANITISED(MVFR1_EL1), ID_SANITISED(MVFR2_EL1), ID_UNALLOCATED(3,3), - 
ID_UNALLOCATED(3,4), - ID_UNALLOCATED(3,5), - ID_UNALLOCATED(3,6), + ID_SANITISED(ID_PFR2_EL1), + ID_HIDDEN(ID_DFR1_EL1), + ID_SANITISED(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ @@ -1532,7 +1546,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, { SYS_DESC(SYS_PMINTENSET_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 }, - { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, NULL, PMINTENSET_EL1 }, + { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, @@ -1571,8 +1585,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, PMCR_EL0 }, { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, - { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, NULL, PMCNTENSET_EL0 }, - { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, NULL, PMOVSSET_EL0 }, + { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, + { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, reset_unknown, PMOVSSET_EL0 }, { SYS_DESC(SYS_PMSWINC_EL0), access_pmswinc, reset_unknown, PMSWINC_EL0 }, { SYS_DESC(SYS_PMSELR_EL0), access_pmselr, reset_unknown, PMSELR_EL0 }, { SYS_DESC(SYS_PMCEID0_EL0), access_pmceid }, @@ -2073,12 +2087,37 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, }; +static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, + bool is_32) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + if (!is_32 && table[i].reg && !table[i].reset) { + kvm_err("sys_reg table %p entry %d lacks reset\n", + table, i); + return 1; + } + + if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { + kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1); + return 1; + } + } + + return 0; +} + /* Target specific emulation tables */ static struct kvm_sys_reg_target_table *target_tables[KVM_ARM_NUM_TARGETS]; void kvm_register_target_sys_reg_table(unsigned int target, struct kvm_sys_reg_target_table *table) { + if (check_sysreg_table(table->table64.table, table->table64.num, false) || + check_sysreg_table(table->table32.table, table->table32.num, true)) + return; + target_tables[target] = table; } @@ -2364,19 +2403,13 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, } static void reset_sys_reg_descs(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *table, size_t num, - unsigned long *bmap) + const struct sys_reg_desc *table, size_t num) { unsigned long i; for (i = 0; i < num; i++) - if (table[i].reset) { - int reg = table[i].reg; - + if (table[i].reset) table[i].reset(vcpu, &table[i]); - if (reg > 0 && reg < NR_SYS_REGS) - set_bit(reg, bmap); - } } /** @@ -2832,32 +2865,18 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n) -{ - unsigned int i; - - for (i = 1; i < n; i++) { - if (cmp_sys_reg(&table[i-1], &table[i]) >= 0) { - kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1); - return 1; - } - } - - return 0; -} - void kvm_sys_reg_table_init(void) { unsigned int i; struct sys_reg_desc clidr; /* Make sure tables are unique and in order. 
*/ - BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs))); - BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs))); - BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs))); - BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs))); - BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs))); - BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs))); + BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false)); + BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true)); + BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true)); + BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true)); + BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true)); + BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false)); /* We abuse the reset function to overwrite the table itself. */ for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) @@ -2893,17 +2912,10 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { size_t num; const struct sys_reg_desc *table; - DECLARE_BITMAP(bmap, NR_SYS_REGS) = { 0, }; /* Generic chip reset first (so target could override). */ - reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs), bmap); + reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); table = get_target_table(vcpu->arch.target, true, &num); - reset_sys_reg_descs(vcpu, table, num, bmap); - - for (num = 1; num < NR_SYS_REGS; num++) { - if (WARN(!test_bit(num, bmap), - "Didn't reset __vcpu_sys_reg(%zi)\n", num)) - break; - } + reset_sys_reg_descs(vcpu, table, num); } diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h index eab91ad0effb..86f9ea47be29 100644 --- a/arch/arm64/kvm/trace.h +++ b/arch/arm64/kvm/trace.h @@ -1,216 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#ifndef _TRACE_ARM64_KVM_H #define _TRACE_ARM64_KVM_H -#include <linux/tracepoint.h> -#include "sys_regs.h" +#include "trace_arm.h" +#include "trace_handle_exit.h" -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -TRACE_EVENT(kvm_wfx_arm64, - TP_PROTO(unsigned long vcpu_pc, bool is_wfe), - TP_ARGS(vcpu_pc, is_wfe), - - TP_STRUCT__entry( - __field(unsigned long, vcpu_pc) - __field(bool, is_wfe) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->is_wfe = is_wfe; - ), - - TP_printk("guest executed wf%c at: 0x%08lx", - __entry->is_wfe ? 
'e' : 'i', __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_hvc_arm64, - TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), - TP_ARGS(vcpu_pc, r0, imm), - - TP_STRUCT__entry( - __field(unsigned long, vcpu_pc) - __field(unsigned long, r0) - __field(unsigned long, imm) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->r0 = r0; - __entry->imm = imm; - ), - - TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)", - __entry->vcpu_pc, __entry->r0, __entry->imm) -); - -TRACE_EVENT(kvm_arm_setup_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_clear_debug, - TP_PROTO(__u32 guest_debug), - TP_ARGS(guest_debug), - - TP_STRUCT__entry( - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->guest_debug = guest_debug; - ), - - TP_printk("flags: 0x%08x", __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_set_dreg32, - TP_PROTO(const char *name, __u32 value), - TP_ARGS(name, value), - - TP_STRUCT__entry( - __field(const char *, name) - __field(__u32, value) - ), - - TP_fast_assign( - __entry->name = name; - __entry->value = value; - ), - - TP_printk("%s: 0x%08x", __entry->name, __entry->value) -); - -TRACE_DEFINE_SIZEOF(__u64); - -TRACE_EVENT(kvm_arm_set_regset, - TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), - TP_ARGS(type, len, control, value), - TP_STRUCT__entry( - __field(const char *, name) - __field(int, len) - __array(u64, ctrls, 16) - __array(u64, values, 16) - ), - TP_fast_assign( - __entry->name = type; - __entry->len = len; - memcpy(__entry->ctrls, control, len << 3); - memcpy(__entry->values, value, len << 3); - ), - TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, - __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), - __print_array(__entry->values, __entry->len, sizeof(__u64))) -); - -TRACE_EVENT(trap_reg, - TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), - TP_ARGS(fn, reg, is_write, write_value), - - TP_STRUCT__entry( - __field(const char *, fn) - __field(int, reg) - __field(bool, is_write) - __field(u64, write_value) - ), - - TP_fast_assign( - __entry->fn = fn; - __entry->reg = reg; - __entry->is_write = is_write; - __entry->write_value = write_value; - ), - - TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) -); - -TRACE_EVENT(kvm_handle_sys_reg, - TP_PROTO(unsigned long hsr), - TP_ARGS(hsr), - - TP_STRUCT__entry( - __field(unsigned long, hsr) - ), - - TP_fast_assign( - __entry->hsr = hsr; - ), - - TP_printk("HSR 0x%08lx", __entry->hsr) -); - -TRACE_EVENT(kvm_sys_access, - TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), - TP_ARGS(vcpu_pc, params, reg), - - TP_STRUCT__entry( - __field(unsigned long, vcpu_pc) - __field(bool, is_write) - __field(const char *, name) - __field(u8, Op0) - __field(u8, Op1) - __field(u8, CRn) - __field(u8, CRm) - __field(u8, Op2) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->is_write = params->is_write; - __entry->name = reg->name; - __entry->Op0 = reg->Op0; - __entry->Op0 = reg->Op0; - __entry->Op1 = reg->Op1; - __entry->CRn = reg->CRn; - __entry->CRm = reg->CRm; - __entry->Op2 = reg->Op2; - ), - - 
TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", - __entry->vcpu_pc, __entry->name ?: "UNKN", - __entry->Op0, __entry->Op1, __entry->CRn, - __entry->CRm, __entry->Op2, - __entry->is_write ? "write" : "read") -); - -TRACE_EVENT(kvm_set_guest_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - - -#endif /* _TRACE_ARM64_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> +#endif /* _TRACE_ARM64_KVM_H */ diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h new file mode 100644 index 000000000000..4c71270cc097 --- /dev/null +++ b/arch/arm64/kvm/trace_arm.h @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ARM_ARM64_KVM_H + +#include <kvm/arm_arch_timer.h> +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for entry/exit to guest + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned long vcpu_pc), + TP_ARGS(vcpu_pc), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("PC: 0x%08lx", __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), + TP_ARGS(ret, esr_ec, vcpu_pc), + + TP_STRUCT__entry( + __field( int, ret ) + __field( unsigned int, esr_ec ) + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->ret = ARM_EXCEPTION_CODE(ret); + __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? 
esr_ec : 0; + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", + __print_symbolic(__entry->ret, kvm_arm_exception_type), + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), + __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_guest_fault, + TP_PROTO(unsigned long vcpu_pc, unsigned long hsr, + unsigned long hxfar, + unsigned long long ipa), + TP_ARGS(vcpu_pc, hsr, hxfar, ipa), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, hsr ) + __field( unsigned long, hxfar ) + __field( unsigned long long, ipa ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->hsr = hsr; + __entry->hxfar = hxfar; + __entry->ipa = ipa; + ), + + TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx", + __entry->ipa, __entry->hsr, + __entry->hxfar, __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_access_fault, + TP_PROTO(unsigned long ipa), + TP_ARGS(ipa), + + TP_STRUCT__entry( + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->ipa = ipa; + ), + + TP_printk("IPA: %lx", __entry->ipa) +); + +TRACE_EVENT(kvm_irq_line, + TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level), + TP_ARGS(type, vcpu_idx, irq_num, level), + + TP_STRUCT__entry( + __field( unsigned int, type ) + __field( int, vcpu_idx ) + __field( int, irq_num ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->vcpu_idx = vcpu_idx; + __entry->irq_num = irq_num; + __entry->level = level; + ), + + TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d", + (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" : + (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" : + (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN", + __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level) +); + +TRACE_EVENT(kvm_mmio_emulate, + TP_PROTO(unsigned long vcpu_pc, unsigned long instr, + unsigned long cpsr), + TP_ARGS(vcpu_pc, instr, cpsr), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, instr ) + __field( unsigned long, cpsr ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->instr = instr; + __entry->cpsr = cpsr; + ), + + TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)", + __entry->vcpu_pc, __entry->instr, __entry->cpsr) +); + +TRACE_EVENT(kvm_unmap_hva_range, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier unmap range: %#08lx -- %#08lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_set_spte_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) +); + +TRACE_EVENT(kvm_age_hva, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier age hva: %#08lx -- %#08lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_test_age_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier test age hva: 
%#08lx", __entry->hva) +); + +TRACE_EVENT(kvm_set_way_flush, + TP_PROTO(unsigned long vcpu_pc, bool cache), + TP_ARGS(vcpu_pc, cache), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, cache ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->cache = cache; + ), + + TP_printk("S/W flush at 0x%016lx (cache %s)", + __entry->vcpu_pc, __entry->cache ? "on" : "off") +); + +TRACE_EVENT(kvm_toggle_cache, + TP_PROTO(unsigned long vcpu_pc, bool was, bool now), + TP_ARGS(vcpu_pc, was, now), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, was ) + __field( bool, now ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->was = was; + __entry->now = now; + ), + + TP_printk("VM op at 0x%016lx (cache was %s, now %s)", + __entry->vcpu_pc, __entry->was ? "on" : "off", + __entry->now ? "on" : "off") +); + +/* + * Tracepoints for arch_timer + */ +TRACE_EVENT(kvm_timer_update_irq, + TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +TRACE_EVENT(kvm_get_timer_map, + TP_PROTO(unsigned long vcpu_id, struct timer_map *map), + TP_ARGS(vcpu_id, map), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( int, direct_vtimer ) + __field( int, direct_ptimer ) + __field( int, emul_ptimer ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); + __entry->direct_ptimer = + (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_ptimer = + (map->emul_ptimer) ? 
arch_timer_ctx_index(map->emul_ptimer) : -1; + ), + + TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", + __entry->vcpu_id, + __entry->direct_vtimer, + __entry->direct_ptimer, + __entry->emul_ptimer) +); + +TRACE_EVENT(kvm_timer_save_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_restore_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_hrtimer_expire, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_emulate, + TP_PROTO(struct arch_timer_context *ctx, bool should_fire), + TP_ARGS(ctx, should_fire), + + TP_STRUCT__entry( + __field( int, timer_idx ) + __field( bool, should_fire ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + __entry->should_fire = should_fire; + ), + + TP_printk("arch_timer_ctx_index: %d (should_fire: %d)", + __entry->timer_idx, __entry->should_fire) +); + +#endif /* _TRACE_ARM_ARM64_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_arm + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h new file mode 100644 index 000000000000..2c56d1e0f5bd --- /dev/null +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HANDLE_EXIT_ARM64_KVM_H + +#include <linux/tracepoint.h> +#include "sys_regs.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(kvm_wfx_arm64, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_wfe) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; + ), + + TP_printk("guest executed wf%c at: 0x%08lx", + __entry->is_wfe ? 
'e' : 'i', __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_hvc_arm64, + TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), + TP_ARGS(vcpu_pc, r0, imm), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(unsigned long, r0) + __field(unsigned long, imm) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->r0 = r0; + __entry->imm = imm; + ), + + TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)", + __entry->vcpu_pc, __entry->r0, __entry->imm) +); + +TRACE_EVENT(kvm_arm_setup_debug, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), + TP_ARGS(vcpu, guest_debug), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->guest_debug = guest_debug; + ), + + TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) +); + +TRACE_EVENT(kvm_arm_clear_debug, + TP_PROTO(__u32 guest_debug), + TP_ARGS(guest_debug), + + TP_STRUCT__entry( + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->guest_debug = guest_debug; + ), + + TP_printk("flags: 0x%08x", __entry->guest_debug) +); + +TRACE_EVENT(kvm_arm_set_dreg32, + TP_PROTO(const char *name, __u32 value), + TP_ARGS(name, value), + + TP_STRUCT__entry( + __field(const char *, name) + __field(__u32, value) + ), + + TP_fast_assign( + __entry->name = name; + __entry->value = value; + ), + + TP_printk("%s: 0x%08x", __entry->name, __entry->value) +); + +TRACE_DEFINE_SIZEOF(__u64); + +TRACE_EVENT(kvm_arm_set_regset, + TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), + TP_ARGS(type, len, control, value), + TP_STRUCT__entry( + __field(const char *, name) + __field(int, len) + __array(u64, ctrls, 16) + __array(u64, values, 16) + ), + TP_fast_assign( + __entry->name = type; + __entry->len = len; + memcpy(__entry->ctrls, control, len << 3); + memcpy(__entry->values, value, len << 3); + ), + TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, + __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), + __print_array(__entry->values, __entry->len, sizeof(__u64))) +); + +TRACE_EVENT(trap_reg, + TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), + TP_ARGS(fn, reg, is_write, write_value), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(int, reg) + __field(bool, is_write) + __field(u64, write_value) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->reg = reg; + __entry->is_write = is_write; + __entry->write_value = write_value; + ), + + TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) +); + +TRACE_EVENT(kvm_handle_sys_reg, + TP_PROTO(unsigned long hsr), + TP_ARGS(hsr), + + TP_STRUCT__entry( + __field(unsigned long, hsr) + ), + + TP_fast_assign( + __entry->hsr = hsr; + ), + + TP_printk("HSR 0x%08lx", __entry->hsr) +); + +TRACE_EVENT(kvm_sys_access, + TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), + TP_ARGS(vcpu_pc, params, reg), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_write) + __field(const char *, name) + __field(u8, Op0) + __field(u8, Op1) + __field(u8, CRn) + __field(u8, CRm) + __field(u8, Op2) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_write = params->is_write; + __entry->name = reg->name; + __entry->Op0 = reg->Op0; + __entry->Op0 = reg->Op0; + __entry->Op1 = reg->Op1; + __entry->CRn = reg->CRn; + __entry->CRm = reg->CRm; + __entry->Op2 = reg->Op2; + ), + + 
TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", + __entry->vcpu_pc, __entry->name ?: "UNKN", + __entry->Op0, __entry->Op1, __entry->CRn, + __entry->CRm, __entry->Op2, + __entry->is_write ? "write" : "read") +); + +TRACE_EVENT(kvm_set_guest_debug, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), + TP_ARGS(vcpu, guest_debug), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->guest_debug = guest_debug; + ), + + TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) +); + +#endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_handle_exit + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index e7d1ea92095d..2f92bdcb1188 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -7,7 +7,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> -#include "vgic.h" +#include "vgic/vgic.h" #include "sys_regs.h" static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, diff --git a/arch/arm64/kvm/vgic/trace.h b/arch/arm64/kvm/vgic/trace.h new file mode 100644 index 000000000000..83c64401a7fc --- /dev/null +++ b/arch/arm64/kvm/vgic/trace.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VGIC_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(vgic_update_irq_pending, + TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( bool, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level: %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +#endif /* _TRACE_VGIC_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c new file mode 100644 index 000000000000..b13a9e3f99dd --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Linaro + * Author: Christoffer Dall <christoffer.dall@linaro.org> + */ + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/interrupt.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_mmu.h> +#include "vgic.h" + +/* + * Structure to control looping through the entire vgic state. We start at + * zero for each field and move upwards. So, if dist_id is 0 we print the + * distributor info. When dist_id is 1, we have already printed it and move + * on. + * + * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and + * so on. 
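The comment above describes a cursor that visits the distributor once, then each VCPU's private interrupts, then the shared interrupts. A minimal userspace model of that staged iterator is sketched below; the bounds are arbitrary and LPIs are omitted, so treat it as an illustration of the control flow rather than the kernel structure.

#include <stdio.h>

#define NR_CPUS     2
#define NR_PRIVATE  4   /* stands in for VGIC_NR_PRIVATE_IRQS */
#define NR_SPIS     3

struct iter {
	int dist_id;   /* 0: distributor not yet visited, 1: done */
	int vcpu_id;   /* which CPU's private interrupts we are on */
	int intid;     /* current interrupt ID                     */
};

static int iter_end(const struct iter *it)
{
	return it->dist_id > 0 &&
	       it->vcpu_id == NR_CPUS &&
	       it->intid >= NR_PRIVATE + NR_SPIS;
}

static void iter_next(struct iter *it)
{
	if (it->dist_id == 0) {          /* stage 1: the distributor */
		it->dist_id++;
		return;
	}
	it->intid++;                     /* stage 2: per-CPU privates */
	if (it->intid == NR_PRIVATE && ++it->vcpu_id < NR_CPUS)
		it->intid = 0;           /* restart privates on the next CPU */
	/* stage 3: shared SPIs simply keep incrementing intid */
}

int main(void)
{
	struct iter it = { 0, 0, 0 };

	for (; !iter_end(&it); iter_next(&it)) {
		if (it.dist_id == 0)
			printf("distributor\n");
		else if (it.intid < NR_PRIVATE)
			printf("vcpu %d private irq %d\n", it.vcpu_id, it.intid);
		else
			printf("spi %d\n", it.intid);
	}
	return 0;
}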
+ */ +struct vgic_state_iter { + int nr_cpus; + int nr_spis; + int nr_lpis; + int dist_id; + int vcpu_id; + int intid; + int lpi_idx; + u32 *lpi_array; +}; + +static void iter_next(struct vgic_state_iter *iter) +{ + if (iter->dist_id == 0) { + iter->dist_id++; + return; + } + + iter->intid++; + if (iter->intid == VGIC_NR_PRIVATE_IRQS && + ++iter->vcpu_id < iter->nr_cpus) + iter->intid = 0; + + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) { + if (iter->lpi_idx < iter->nr_lpis) + iter->intid = iter->lpi_array[iter->lpi_idx]; + iter->lpi_idx++; + } +} + +static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, + loff_t pos) +{ + int nr_cpus = atomic_read(&kvm->online_vcpus); + + memset(iter, 0, sizeof(*iter)); + + iter->nr_cpus = nr_cpus; + iter->nr_spis = kvm->arch.vgic.nr_spis; + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array); + if (iter->nr_lpis < 0) + iter->nr_lpis = 0; + } + + /* Fast forward to the right position if needed */ + while (pos--) + iter_next(iter); +} + +static bool end_of_vgic(struct vgic_state_iter *iter) +{ + return iter->dist_id > 0 && + iter->vcpu_id == iter->nr_cpus && + iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && + iter->lpi_idx > iter->nr_lpis; +} + +static void *vgic_debug_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter; + + mutex_lock(&kvm->lock); + iter = kvm->arch.vgic.iter; + if (iter) { + iter = ERR_PTR(-EBUSY); + goto out; + } + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + iter = ERR_PTR(-ENOMEM); + goto out; + } + + iter_init(kvm, iter, *pos); + kvm->arch.vgic.iter = iter; + + if (end_of_vgic(iter)) + iter = NULL; +out: + mutex_unlock(&kvm->lock); + return iter; +} + +static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter = kvm->arch.vgic.iter; + + ++*pos; + iter_next(iter); + if (end_of_vgic(iter)) + iter = NULL; + return iter; +} + +static void vgic_debug_stop(struct seq_file *s, void *v) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter; + + /* + * If the seq file wasn't properly opened, there's nothing to clearn + * up. + */ + if (IS_ERR(v)) + return; + + mutex_lock(&kvm->lock); + iter = kvm->arch.vgic.iter; + kfree(iter->lpi_array); + kfree(iter); + kvm->arch.vgic.iter = NULL; + mutex_unlock(&kvm->lock); +} + +static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) +{ + bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; + + seq_printf(s, "Distributor\n"); + seq_printf(s, "===========\n"); + seq_printf(s, "vgic_model:\t%s\n", v3 ? 
"GICv3" : "GICv2"); + seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); + if (v3) + seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); + seq_printf(s, "enabled:\t%d\n", dist->enabled); + seq_printf(s, "\n"); + + seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); + seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); + seq_printf(s, "G=group\n"); +} + +static void print_header(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + int id = 0; + char *hdr = "SPI "; + + if (vcpu) { + hdr = "VCPU"; + id = vcpu->vcpu_id; + } + + seq_printf(s, "\n"); + seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); + seq_printf(s, "----------------------------------------------------------------\n"); +} + +static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + char *type; + bool pending; + + if (irq->intid < VGIC_NR_SGIS) + type = "SGI"; + else if (irq->intid < VGIC_NR_PRIVATE_IRQS) + type = "PPI"; + else if (irq->intid < VGIC_MAX_SPI) + type = "SPI"; + else + type = "LPI"; + + if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) + print_header(s, irq, vcpu); + + pending = irq->pending_latch; + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + int err; + + err = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &pending); + WARN_ON_ONCE(err); + } + + seq_printf(s, " %s %4d " + " %2d " + "%d%d%d%d%d%d%d " + "%8d " + "%8x " + " %2x " + "%3d " + " %2d " + "\n", + type, irq->intid, + (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1, + pending, + irq->line_level, + irq->active, + irq->enabled, + irq->hw, + irq->config == VGIC_CONFIG_LEVEL, + irq->group, + irq->hwintid, + irq->mpidr, + irq->source, + irq->priority, + (irq->vcpu) ? irq->vcpu->vcpu_id : -1); +} + +static int vgic_debug_show(struct seq_file *s, void *v) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter = (struct vgic_state_iter *)v; + struct vgic_irq *irq; + struct kvm_vcpu *vcpu = NULL; + unsigned long flags; + + if (iter->dist_id == 0) { + print_dist_state(s, &kvm->arch.vgic); + return 0; + } + + if (!kvm->arch.vgic.initialized) + return 0; + + if (iter->vcpu_id < iter->nr_cpus) + vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); + + irq = vgic_get_irq(kvm, vcpu, iter->intid); + if (!irq) { + seq_printf(s, " LPI %4d freed\n", iter->intid); + return 0; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + print_irq_state(s, irq, vcpu); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(kvm, irq); + return 0; +} + +static const struct seq_operations vgic_debug_seq_ops = { + .start = vgic_debug_start, + .next = vgic_debug_next, + .stop = vgic_debug_stop, + .show = vgic_debug_show +}; + +static int debug_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &vgic_debug_seq_ops); + if (!ret) { + struct seq_file *seq; + /* seq_open will have modified file->private_data */ + seq = file->private_data; + seq->private = inode->i_private; + } + + return ret; +}; + +static const struct file_operations vgic_debug_fops = { + .owner = THIS_MODULE, + .open = debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +void vgic_debug_init(struct kvm *kvm) +{ + debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm, + &vgic_debug_fops); +} + +void vgic_debug_destroy(struct kvm *kvm) +{ +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c new file mode 100644 index 000000000000..32e32d67a127 --- 
/dev/null +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ + +#include <linux/uaccess.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> +#include "vgic.h" + +/* + * Initialization rules: there are multiple stages to the vgic + * initialization, both for the distributor and the CPU interfaces. The basic + * idea is that even though the VGIC is not functional or not requested from + * user space, the critical path of the run loop can still call VGIC functions + * that just won't do anything, without them having to check additional + * initialization flags to ensure they don't look at uninitialized data + * structures. + * + * Distributor: + * + * - kvm_vgic_early_init(): initialization of static data that doesn't + * depend on any sizing information or emulation type. No allocation + * is allowed there. + * + * - vgic_init(): allocation and initialization of the generic data + * structures that depend on sizing information (number of CPUs, + * number of interrupts). Also initializes the vcpu specific data + * structures. Can be executed lazily for GICv2. + * + * CPU Interface: + * + * - kvm_vgic_vcpu_init(): initialization of static data that + * doesn't depend on any sizing information or emulation type. No + * allocation is allowed there. + */ + +/* EARLY INIT */ + +/** + * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures + * @kvm: The VM whose VGIC districutor should be initialized + * + * Only do initialization of static structures that don't require any + * allocation or sizing information from userspace. vgic_init() called + * kvm_vgic_dist_init() which takes care of the rest. + */ +void kvm_vgic_early_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + INIT_LIST_HEAD(&dist->lpi_list_head); + INIT_LIST_HEAD(&dist->lpi_translation_cache); + raw_spin_lock_init(&dist->lpi_list_lock); +} + +/* CREATION */ + +/** + * kvm_vgic_create: triggered by the instantiation of the VGIC device by + * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) + * or through the generic KVM_CREATE_DEVICE API ioctl. + * irqchip_in_kernel() tells you if this function succeeded or not. + * @kvm: kvm struct pointer + * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] + */ +int kvm_vgic_create(struct kvm *kvm, u32 type) +{ + int i, ret; + struct kvm_vcpu *vcpu; + + if (irqchip_in_kernel(kvm)) + return -EEXIST; + + /* + * This function is also called by the KVM_CREATE_IRQCHIP handler, + * which had no chance yet to check the availability of the GICv2 + * emulation. So check this here again. KVM_CREATE_DEVICE does + * the proper checks already. 
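The "Initialization rules" comment earlier in this file splits setup into an early stage that only touches static fields and a later, sized stage that allocates once the dimensions are known. A rough userspace model of that split, with invented names, looks like this:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct dist_model {
	int nr_spis;        /* unknown at early-init time   */
	int *spi_state;     /* allocated only by sized init */
	int initialized;
};

static void early_init(struct dist_model *d)
{
	memset(d, 0, sizeof(*d));   /* safe very early: no allocation */
}

static int sized_init(struct dist_model *d, int nr_spis)
{
	if (d->initialized)
		return 0;               /* idempotent, like the lazy GICv2 path */
	d->spi_state = calloc(nr_spis, sizeof(*d->spi_state));
	if (!d->spi_state)
		return -1;
	d->nr_spis = nr_spis;
	d->initialized = 1;
	return 0;
}

int main(void)
{
	struct dist_model d;

	early_init(&d);
	/* ... userspace decides the sizing later ... */
	if (sized_init(&d, 32))
		return 1;
	printf("distributor ready with %d SPIs\n", d.nr_spis);
	free(d.spi_state);
	return 0;
}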
+ */ + if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && + !kvm_vgic_global_state.can_emulate_gicv2) + return -ENODEV; + + ret = -EBUSY; + if (!lock_all_vcpus(kvm)) + return ret; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.has_run_once) + goto out_unlock; + } + ret = 0; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; + else + kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS; + + if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) { + ret = -E2BIG; + goto out_unlock; + } + + kvm->arch.vgic.in_kernel = true; + kvm->arch.vgic.vgic_model = type; + + kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + else + INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + +out_unlock: + unlock_all_vcpus(kvm); + return ret; +} + +/* INIT/DESTROY */ + +/** + * kvm_vgic_dist_init: initialize the dist data structures + * @kvm: kvm struct pointer + * @nr_spis: number of spis, frozen by caller + */ +static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); + int i; + + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); + if (!dist->spis) + return -ENOMEM; + + /* + * In the following code we do not take the irq struct lock since + * no other action on irq structs can happen while the VGIC is + * not initialized yet: + * If someone wants to inject an interrupt or does a MMIO access, we + * require prior initialization in case of a virtual GICv3 or trigger + * initialization when using a virtual GICv2. + */ + for (i = 0; i < nr_spis; i++) { + struct vgic_irq *irq = &dist->spis[i]; + + irq->intid = i + VGIC_NR_PRIVATE_IRQS; + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + irq->vcpu = NULL; + irq->target_vcpu = vcpu0; + kref_init(&irq->refcount); + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->targets = 0; + irq->group = 0; + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->mpidr = 0; + irq->group = 1; + break; + default: + kfree(dist->spis); + dist->spis = NULL; + return -EINVAL; + } + } + return 0; +} + +/** + * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data + * structures and register VCPU-specific KVM iodevs + * + * @vcpu: pointer to the VCPU being created and initialized + * + * Only do initialization, but do not actually enable the + * VGIC CPU interface + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int ret = 0; + int i; + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + + /* + * Enable and configure all SGIs to be edge-triggered and + * configure all PPIs as level-triggered. 
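The loop that follows relies on the architectural ID ranges: 0-15 are SGIs (edge-triggered and enabled at reset) and 16-31 are PPIs (level-triggered). A self-contained sketch of that classification, using hard-coded bounds rather than the kernel's VGIC_NR_* constants, is shown below.

#include <stdio.h>

enum cfg { EDGE, LEVEL };

struct irq_model {
	int intid;
	int enabled;
	enum cfg config;
};

static void init_private_irq(struct irq_model *irq, int intid)
{
	irq->intid = intid;
	if (intid < 16) {            /* SGI: edge, enabled */
		irq->enabled = 1;
		irq->config = EDGE;
	} else {                     /* PPI: level, left disabled */
		irq->enabled = 0;
		irq->config = LEVEL;
	}
}

int main(void)
{
	struct irq_model irqs[32];

	for (int i = 0; i < 32; i++)
		init_private_irq(&irqs[i], i);

	printf("irq 3: %s\n", irqs[3].config == EDGE ? "edge" : "level");
	printf("irq 27: %s\n", irqs[27].config == EDGE ? "edge" : "level");
	return 0;
}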
+ */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + irq->intid = i; + irq->vcpu = NULL; + irq->target_vcpu = vcpu; + kref_init(&irq->refcount); + if (vgic_irq_is_sgi(i)) { + /* SGIs */ + irq->enabled = 1; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + } + + if (!irqchip_in_kernel(vcpu->kvm)) + return 0; + + /* + * If we are creating a VCPU with a GICv3 we must also register the + * KVM io device for the redistributor that belongs to this VCPU. + */ + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + mutex_lock(&vcpu->kvm->lock); + ret = vgic_register_redist_iodev(vcpu); + mutex_unlock(&vcpu->kvm->lock); + } + return ret; +} + +static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_enable(vcpu); + else + vgic_v3_enable(vcpu); +} + +/* + * vgic_init: allocates and initializes dist and vcpu data structures + * depending on two dimensioning parameters: + * - the number of spis + * - the number of vcpus + * The function is generally called when nr_spis has been explicitly set + * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. + * vgic_initialized() returns true when this function has succeeded. + * Must be called with kvm->lock held! + */ +int vgic_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int ret = 0, i, idx; + + if (vgic_initialized(kvm)) + return 0; + + /* Are we also in the middle of creating a VCPU? */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + return -EBUSY; + + /* freeze the number of spis */ + if (!dist->nr_spis) + dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; + + ret = kvm_vgic_dist_init(kvm, dist->nr_spis); + if (ret) + goto out; + + /* Initialize groups on CPUs created before the VGIC type was known */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = 1U << idx; + break; + default: + ret = -EINVAL; + goto out; + } + } + } + + if (vgic_has_its(kvm)) + vgic_lpi_translation_cache_init(kvm); + + /* + * If we have GICv4.1 enabled, unconditionnaly request enable the + * v4 support so that we get HW-accelerated vSGIs. Otherwise, only + * enable it if we present a virtual ITS to the guest. 
+ */ + if (vgic_supports_direct_msis(kvm)) { + ret = vgic_v4_init(kvm); + if (ret) + goto out; + } + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_enable(vcpu); + + ret = kvm_vgic_setup_default_irq_routing(kvm); + if (ret) + goto out; + + vgic_debug_init(kvm); + + dist->implementation_rev = 2; + dist->initialized = true; + +out: + return ret; +} + +static void kvm_vgic_dist_destroy(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_redist_region *rdreg, *next; + + dist->ready = false; + dist->initialized = false; + + kfree(dist->spis); + dist->spis = NULL; + dist->nr_spis = 0; + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) { + list_del(&rdreg->list); + kfree(rdreg); + } + INIT_LIST_HEAD(&dist->rd_regions); + } + + if (vgic_has_its(kvm)) + vgic_lpi_translation_cache_destroy(kvm); + + if (vgic_supports_direct_msis(kvm)) + vgic_v4_teardown(kvm); +} + +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + /* + * Retire all pending LPIs on this vcpu anyway as we're + * going to destroy it. + */ + vgic_flush_pending_lpis(vcpu); + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); +} + +/* To be called with kvm->lock held */ +static void __kvm_vgic_destroy(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + vgic_debug_destroy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_destroy(vcpu); + + kvm_vgic_dist_destroy(kvm); +} + +void kvm_vgic_destroy(struct kvm *kvm) +{ + mutex_lock(&kvm->lock); + __kvm_vgic_destroy(kvm); + mutex_unlock(&kvm->lock); +} + +/** + * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest + * is a GICv2. A GICv3 must be explicitly initialized by the guest using the + * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. + * @kvm: kvm struct pointer + */ +int vgic_lazy_init(struct kvm *kvm) +{ + int ret = 0; + + if (unlikely(!vgic_initialized(kvm))) { + /* + * We only provide the automatic initialization of the VGIC + * for the legacy case of a GICv2. Any other type must + * be explicitly initialized once setup with the respective + * KVM device call. + */ + if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) + return -EBUSY; + + mutex_lock(&kvm->lock); + ret = vgic_init(kvm); + mutex_unlock(&kvm->lock); + } + + return ret; +} + +/* RESOURCE MAPPING */ + +/** + * Map the MMIO regions depending on the VGIC model exposed to the guest + * called on the first VCPU run. + * Also map the virtual CPU interface into the VM. + * v2/v3 derivatives call vgic_init if not already done. + * vgic_ready() returns true if this function has succeeded. 
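The mapping step described above runs once, on the first VCPU run, with the VM lock held. The general "set up once behind a lock and a ready flag" shape can be modelled in plain pthreads as follows; this is illustrative only, and do_map() stands in for the model-specific v2/v3 work.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int ready;

static int do_map(void)
{
	printf("mapping MMIO regions\n");   /* pretend this can fail */
	return 0;
}

static int map_resources(void)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	if (!ready) {
		ret = do_map();
		if (!ret)
			ready = 1;
	}
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	map_resources();      /* first vcpu run: does the work   */
	map_resources();      /* later runs: already set up      */
	printf("ready=%d\n", ready);
	return 0;
}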
+ * @kvm: kvm struct pointer + */ +int kvm_vgic_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret = 0; + + mutex_lock(&kvm->lock); + if (!irqchip_in_kernel(kvm)) + goto out; + + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + ret = vgic_v2_map_resources(kvm); + else + ret = vgic_v3_map_resources(kvm); + + if (ret) + __kvm_vgic_destroy(kvm); + +out: + mutex_unlock(&kvm->lock); + return ret; +} + +/* GENERIC PROBE */ + +static int vgic_init_cpu_starting(unsigned int cpu) +{ + enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); + return 0; +} + + +static int vgic_init_cpu_dying(unsigned int cpu) +{ + disable_percpu_irq(kvm_vgic_global_state.maint_irq); + return 0; +} + +static irqreturn_t vgic_maintenance_handler(int irq, void *data) +{ + /* + * We cannot rely on the vgic maintenance interrupt to be + * delivered synchronously. This means we can only use it to + * exit the VM, and we perform the handling of EOIed + * interrupts on the exit path (see vgic_fold_lr_state). + */ + return IRQ_HANDLED; +} + +/** + * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware + * + * For a specific CPU, initialize the GIC VE hardware. + */ +void kvm_vgic_init_cpu_hardware(void) +{ + BUG_ON(preemptible()); + + /* + * We want to make sure the list registers start out clear so that we + * only have the program the used registers. + */ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_init_lrs(); + else + kvm_call_hyp(__vgic_v3_init_lrs); +} + +/** + * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable + * according to the host GIC model. Accordingly calls either + * vgic_v2/v3_probe which registers the KVM_DEVICE that can be + * instantiated by a guest later on . + */ +int kvm_vgic_hyp_init(void) +{ + const struct gic_kvm_info *gic_kvm_info; + int ret; + + gic_kvm_info = gic_get_kvm_info(); + if (!gic_kvm_info) + return -ENODEV; + + if (!gic_kvm_info->maint_irq) { + kvm_err("No vgic maintenance irq\n"); + return -ENXIO; + } + + switch (gic_kvm_info->type) { + case GIC_V2: + ret = vgic_v2_probe(gic_kvm_info); + break; + case GIC_V3: + ret = vgic_v3_probe(gic_kvm_info); + if (!ret) { + static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); + kvm_info("GIC system register CPU interface enabled\n"); + } + break; + default: + ret = -ENODEV; + } + + if (ret) + return ret; + + kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq; + ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, + vgic_maintenance_handler, + "vgic", kvm_get_running_vcpus()); + if (ret) { + kvm_err("Cannot register interrupt %d\n", + kvm_vgic_global_state.maint_irq); + return ret; + } + + ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, + "kvm/arm/vgic:starting", + vgic_init_cpu_starting, vgic_init_cpu_dying); + if (ret) { + kvm_err("Cannot register vgic CPU notifier\n"); + goto out_free_irq; + } + + kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); + return 0; + +out_free_irq: + free_percpu_irq(kvm_vgic_global_state.maint_irq, + kvm_get_running_vcpus()); + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c new file mode 100644 index 000000000000..d8cdfea5cc96 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. 
+ */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> +#include <kvm/arm_vgic.h> +#include "vgic.h" + +/** + * vgic_irqfd_set_irq: inject the IRQ corresponding to the + * irqchip routing entry + * + * This is the entry point for irqfd IRQ injection + */ +static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS; + + if (!vgic_valid_spi(kvm, spi_id)) + return -EINVAL; + return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL); +} + +/** + * kvm_set_routing_entry: populate a kvm routing entry + * from a user routing entry + * + * @kvm: the VM this entry is applied to + * @e: kvm kernel routing entry handle + * @ue: user api routing entry handle + * return 0 on success, -EINVAL on errors. + */ +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = vgic_irqfd_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || + (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + e->msi.flags = ue->flags; + e->msi.devid = ue->u.msi.devid; + break; + default: + goto out; + } + r = 0; +out: + return r; +} + +static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm_msi *msi) +{ + msi->address_lo = e->msi.address_lo; + msi->address_hi = e->msi.address_hi; + msi->data = e->msi.data; + msi->flags = e->msi.flags; + msi->devid = e->msi.devid; +} +/** + * kvm_set_msi: inject the MSI corresponding to the + * MSI routing entry + * + * This is the entry point for irqfd MSI injection + * and userspace MSI injection. + */ +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + struct kvm_msi msi; + + if (!vgic_has_its(kvm)) + return -ENODEV; + + if (!level) + return -1; + + kvm_populate_msi(e, &msi); + return vgic_its_inject_msi(kvm, &msi); +} + +/** + * kvm_arch_set_irq_inatomic: fast-path for irqfd injection + * + * Currently only direct MSI injection is supported. 
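The fast path below may not sleep, so it only consults a cache and reports -EWOULDBLOCK when the caller has to fall back to the blocking path. A toy userspace version of that split, with a faked cache in place of the ITS translation cache, is:

#include <errno.h>
#include <stdio.h>

static int cache_lookup(int devid, int eventid, int *intid)
{
	/* pretend only devid 1 / eventid 2 is currently cached */
	if (devid == 1 && eventid == 2) {
		*intid = 8192;
		return 0;
	}
	return -1;
}

static int inject_fast(int devid, int eventid)
{
	int intid;

	if (cache_lookup(devid, eventid, &intid) == 0) {
		printf("fast path: injected LPI %d\n", intid);
		return 0;
	}
	return -EWOULDBLOCK;    /* caller retries via the sleeping path */
}

static int inject_slow(int devid, int eventid)
{
	printf("slow path: walked the ITS tables for %d/%d\n", devid, eventid);
	return 0;
}

int main(void)
{
	if (inject_fast(1, 2) == -EWOULDBLOCK)
		inject_slow(1, 2);
	if (inject_fast(3, 4) == -EWOULDBLOCK)
		inject_slow(3, 4);
	return 0;
}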
+ */ +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) { + struct kvm_msi msi; + + kvm_populate_msi(e, &msi); + if (!vgic_its_inject_cached_translation(kvm, &msi)) + return 0; + } + + return -EWOULDBLOCK; +} + +int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) +{ + struct kvm_irq_routing_entry *entries; + struct vgic_dist *dist = &kvm->arch.vgic; + u32 nr = dist->nr_spis; + int i, ret; + + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + entries[i].gsi = i; + entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; + entries[i].u.irqchip.irqchip = 0; + entries[i].u.irqchip.pin = i; + } + ret = kvm_set_irq_routing(kvm, entries, nr, 0); + kfree(entries); + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c new file mode 100644 index 000000000000..c012a52b19f5 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -0,0 +1,2783 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GICv3 ITS emulation + * + * Copyright (C) 2015,2016 ARM Ltd. + * Author: Andre Przywara <andre.przywara@arm.com> + */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/uaccess.h> +#include <linux/list_sort.h> + +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +static int vgic_its_save_tables_v0(struct vgic_its *its); +static int vgic_its_restore_tables_v0(struct vgic_its *its); +static int vgic_its_commit_v0(struct vgic_its *its); +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu, bool needs_inv); + +/* + * Creates a new (reference to a) struct vgic_irq for a given LPI. + * If this LPI is already mapped on another ITS, we increase its refcount + * and return a pointer to the existing structure. + * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. + * This function returns a pointer to the _unlocked_ structure. + */ +static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, + struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; + unsigned long flags; + int ret; + + /* In this case there is no put, since we keep the reference. */ + if (irq) + return irq; + + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); + if (!irq) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&irq->lpi_list); + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + + irq->config = VGIC_CONFIG_EDGE; + kref_init(&irq->refcount); + irq->intid = intid; + irq->target_vcpu = vcpu; + irq->group = 1; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + /* + * There could be a race with another vgic_add_lpi(), so we need to + * check that we don't add a second list entry with the same LPI. + */ + list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { + if (oldirq->intid != intid) + continue; + + /* Someone was faster with adding this LPI, lets use that. */ + kfree(irq); + irq = oldirq; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() on the returned pointer once it's + * finished with the IRQ. 
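The get/put discipline mentioned above, where a lookup hands back the object with an extra reference and the caller drops it when done, reduces to the following single-threaded sketch; the kernel uses kref and atomics, which are deliberately left out here.

#include <stdio.h>
#include <stdlib.h>

struct obj {
	int id;
	int refcount;
};

static struct obj *obj_get(struct obj *o)
{
	o->refcount++;
	return o;
}

static void obj_put(struct obj *o)
{
	if (--o->refcount == 0) {
		printf("freeing obj %d\n", o->id);
		free(o);
	}
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	o->id = 8192;
	o->refcount = 1;                /* reference held by the "list" */

	struct obj *ref = obj_get(o);   /* lookup returns an extra ref  */
	printf("using obj %d (refcount %d)\n", ref->id, ref->refcount);
	obj_put(ref);                   /* caller drops its reference   */

	obj_put(o);                     /* list drops its reference too */
	return 0;
}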
+ */ + vgic_get_irq_kref(irq); + + goto out_unlock; + } + + list_add_tail(&irq->lpi_list, &dist->lpi_list_head); + dist->lpi_list_count++; + +out_unlock: + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + /* + * We "cache" the configuration table entries in our struct vgic_irq's. + * However we only have those structs for mapped IRQs, so we read in + * the respective config data from memory here upon mapping the LPI. + * + * Should any of these fail, behave as if we couldn't create the LPI + * by dropping the refcount and returning the error. + */ + ret = update_lpi_config(kvm, irq, NULL, false); + if (ret) { + vgic_put_irq(kvm, irq); + return ERR_PTR(ret); + } + + ret = vgic_v3_lpi_sync_pending_status(kvm, irq); + if (ret) { + vgic_put_irq(kvm, irq); + return ERR_PTR(ret); + } + + return irq; +} + +struct its_device { + struct list_head dev_list; + + /* the head for the list of ITTEs */ + struct list_head itt_head; + u32 num_eventid_bits; + gpa_t itt_addr; + u32 device_id; +}; + +#define COLLECTION_NOT_MAPPED ((u32)~0) + +struct its_collection { + struct list_head coll_list; + + u32 collection_id; + u32 target_addr; +}; + +#define its_is_collection_mapped(coll) ((coll) && \ + ((coll)->target_addr != COLLECTION_NOT_MAPPED)) + +struct its_ite { + struct list_head ite_list; + + struct vgic_irq *irq; + struct its_collection *collection; + u32 event_id; +}; + +struct vgic_translation_cache_entry { + struct list_head entry; + phys_addr_t db; + u32 devid; + u32 eventid; + struct vgic_irq *irq; +}; + +/** + * struct vgic_its_abi - ITS abi ops and settings + * @cte_esz: collection table entry size + * @dte_esz: device table entry size + * @ite_esz: interrupt translation table entry size + * @save tables: save the ITS tables into guest RAM + * @restore_tables: restore the ITS internal structs from tables + * stored in guest RAM + * @commit: initialize the registers which expose the ABI settings, + * especially the entry sizes + */ +struct vgic_its_abi { + int cte_esz; + int dte_esz; + int ite_esz; + int (*save_tables)(struct vgic_its *its); + int (*restore_tables)(struct vgic_its *its); + int (*commit)(struct vgic_its *its); +}; + +#define ABI_0_ESZ 8 +#define ESZ_MAX ABI_0_ESZ + +static const struct vgic_its_abi its_table_abi_versions[] = { + [0] = { + .cte_esz = ABI_0_ESZ, + .dte_esz = ABI_0_ESZ, + .ite_esz = ABI_0_ESZ, + .save_tables = vgic_its_save_tables_v0, + .restore_tables = vgic_its_restore_tables_v0, + .commit = vgic_its_commit_v0, + }, +}; + +#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) + +inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) +{ + return &its_table_abi_versions[its->abi_rev]; +} + +static int vgic_its_set_abi(struct vgic_its *its, u32 rev) +{ + const struct vgic_its_abi *abi; + + its->abi_rev = rev; + abi = vgic_its_get_abi(its); + return abi->commit(its); +} + +/* + * Find and returns a device in the device table for an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) +{ + struct its_device *device; + + list_for_each_entry(device, &its->device_list, dev_list) + if (device_id == device->device_id) + return device; + + return NULL; +} + +/* + * Find and returns an interrupt translation table entry (ITTE) for a given + * Device ID/Event ID pair on an ITS. + * Must be called with the its_lock mutex held. 
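find_ite() below performs a two-level lookup: the DeviceID selects a device, the EventID selects an entry in that device's translation table, and the entry carries the LPI. A toy model of that data shape, using arrays instead of the kernel's list_head chains, is:

#include <stdio.h>

struct ite { int event_id; int intid; };
struct dev { int device_id; struct ite itt[2]; int nr_ites; };

static const struct dev devices[] = {
	{ .device_id = 0x10, .nr_ites = 2,
	  .itt = { { .event_id = 0, .intid = 8192 },
	           { .event_id = 1, .intid = 8193 } } },
};

static const struct ite *find_ite(int device_id, int event_id)
{
	for (unsigned int d = 0; d < sizeof(devices) / sizeof(devices[0]); d++) {
		if (devices[d].device_id != device_id)
			continue;
		for (int e = 0; e < devices[d].nr_ites; e++)
			if (devices[d].itt[e].event_id == event_id)
				return &devices[d].itt[e];
	}
	return NULL;
}

int main(void)
{
	const struct ite *ite = find_ite(0x10, 1);

	if (ite)
		printf("DeviceID 0x10 / EventID 1 -> LPI %d\n", ite->intid);
	return 0;
}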
+ */ +static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, + u32 event_id) +{ + struct its_device *device; + struct its_ite *ite; + + device = find_its_device(its, device_id); + if (device == NULL) + return NULL; + + list_for_each_entry(ite, &device->itt_head, ite_list) + if (ite->event_id == event_id) + return ite; + + return NULL; +} + +/* To be used as an iterator this macro misses the enclosing parentheses */ +#define for_each_lpi_its(dev, ite, its) \ + list_for_each_entry(dev, &(its)->device_list, dev_list) \ + list_for_each_entry(ite, &(dev)->itt_head, ite_list) + +#define GIC_LPI_OFFSET 8192 + +#define VITS_TYPER_IDBITS 16 +#define VITS_TYPER_DEVBITS 16 +#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1) +#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1) + +/* + * Finds and returns a collection in the ITS collection table. + * Must be called with the its_lock mutex held. + */ +static struct its_collection *find_collection(struct vgic_its *its, int coll_id) +{ + struct its_collection *collection; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + if (coll_id == collection->collection_id) + return collection; + } + + return NULL; +} + +#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) +#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) + +/* + * Reads the configuration data for a given LPI from guest memory and + * updates the fields in struct vgic_irq. + * If filter_vcpu is not NULL, applies only if the IRQ is targeting this + * VCPU. Unconditionally applies if filter_vcpu is NULL. + */ +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu, bool needs_inv) +{ + u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); + u8 prop; + int ret; + unsigned long flags; + + ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET, + &prop, 1); + + if (ret) + return ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { + irq->priority = LPI_PROP_PRIORITY(prop); + irq->enabled = LPI_PROP_ENABLE_BIT(prop); + + if (!irq->hw) { + vgic_queue_irq_unlock(kvm, irq, flags); + return 0; + } + } + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + if (irq->hw) + return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); + + return 0; +} + +/* + * Create a snapshot of the current LPIs targeting @vcpu, so that we can + * enumerate those LPIs without holding any lock. + * Returns their number and puts the kmalloc'ed array into intid_ptr. + */ +int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long flags; + u32 *intids; + int irq_count, i = 0; + + /* + * There is an obvious race between allocating the array and LPIs + * being mapped/unmapped. If we ended up here as a result of a + * command, we're safe (locks are held, preventing another + * command). If coming from another path (such as enabling LPIs), + * we must be careful not to overrun the array. + */ + irq_count = READ_ONCE(dist->lpi_list_count); + intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); + if (!intids) + return -ENOMEM; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + if (i == irq_count) + break; + /* We don't need to "get" the IRQ, as we hold the list lock. 
*/ + if (vcpu && irq->target_vcpu != vcpu) + continue; + intids[i++] = irq->intid; + } + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + *intid_ptr = intids; + return i; +} + +static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) +{ + int ret = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->target_vcpu = vcpu; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + if (irq->hw) { + struct its_vlpi_map map; + + ret = its_get_vlpi(irq->host_irq, &map); + if (ret) + return ret; + + if (map.vpe) + atomic_dec(&map.vpe->vlpi_count); + map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + atomic_inc(&map.vpe->vlpi_count); + + ret = its_map_vlpi(irq->host_irq, &map); + } + + return ret; +} + +/* + * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI + * is targeting) to the VGIC's view, which deals with target VCPUs. + * Needs to be called whenever either the collection for a LPIs has + * changed or the collection itself got retargeted. + */ +static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) +{ + struct kvm_vcpu *vcpu; + + if (!its_is_collection_mapped(ite->collection)) + return; + + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + update_affinity(ite->irq, vcpu); +} + +/* + * Updates the target VCPU for every LPI targeting this collection. + * Must be called with the its_lock mutex held. + */ +static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, + struct its_collection *coll) +{ + struct its_device *device; + struct its_ite *ite; + + for_each_lpi_its(device, ite, its) { + if (!ite->collection || coll != ite->collection) + continue; + + update_affinity_ite(kvm, ite); + } +} + +static u32 max_lpis_propbaser(u64 propbaser) +{ + int nr_idbits = (propbaser & 0x1f) + 1; + + return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); +} + +/* + * Sync the pending table pending bit of LPIs targeting @vcpu + * with our own data structures. This relies on the LPI being + * mapped before. + */ +static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) +{ + gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + struct vgic_irq *irq; + int last_byte_offset = -1; + int ret = 0; + u32 *intids; + int nr_irqs, i; + unsigned long flags; + u8 pendmask; + + nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids); + if (nr_irqs < 0) + return nr_irqs; + + for (i = 0; i < nr_irqs; i++) { + int byte_offset, bit_nr; + + byte_offset = intids[i] / BITS_PER_BYTE; + bit_nr = intids[i] % BITS_PER_BYTE; + + /* + * For contiguously allocated LPIs chances are we just read + * this very same byte in the last iteration. Reuse that. + */ + if (byte_offset != last_byte_offset) { + ret = kvm_read_guest_lock(vcpu->kvm, + pendbase + byte_offset, + &pendmask, 1); + if (ret) { + kfree(intids); + return ret; + } + last_byte_offset = byte_offset; + } + + irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = pendmask & (1U << bit_nr); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + vgic_put_irq(vcpu->kvm, irq); + } + + kfree(intids); + + return ret; +} + +static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 reg = GITS_TYPER_PLPIS; + + /* + * We use linear CPU numbers for redistributor addressing, + * so GITS_TYPER.PTA is 0. 
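[Aside] The byte/bit arithmetic used when scanning the guest's pending table is worth seeing on its own: each INTID owns one bit, addressed as byte INTID/8, bit INTID%8, which is also why consecutive LPIs usually hit the byte cached from the previous loop iteration in its_sync_lpi_pending_table(). A standalone sketch with arbitrary INTIDs:

#include <stdint.h>
#include <stdio.h>

/* Locate the pending bit of an LPI in the GICR_PENDBASER table. */
static void locate_pending_bit(uint32_t intid)
{
	uint32_t byte_offset = intid / 8;
	uint32_t bit_nr = intid % 8;

	printf("INTID %u -> byte %u, bit %u (mask 0x%02x)\n",
	       intid, byte_offset, bit_nr, 1u << bit_nr);
}

int main(void)
{
	/* Contiguous INTIDs land in the same byte, hence the kernel's
	 * "reuse the last byte we read" optimisation. */
	locate_pending_bit(8192);
	locate_pending_bit(8193);
	locate_pending_bit(8200);
	return 0;
}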
+ * Also we force all PROPBASER registers to be the same, so + * CommonLPIAff is 0 as well. + * To avoid memory waste in the guest, we keep the number of IDBits and + * DevBits low - as least for the time being. + */ + reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT; + reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; + reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; + + return extract_bytes(reg, addr & 7, len); +} + +static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 val; + + val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; + val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; + return val; +} + +static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 rev = GITS_IIDR_REV(val); + + if (rev >= NR_ITS_ABIS) + return -EINVAL; + return vgic_its_set_abi(its, rev); +} + +static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GITS_PIDR0: + return 0x92; /* part number, bits[7:0] */ + case GITS_PIDR1: + return 0xb4; /* part number, bits[11:8] */ + case GITS_PIDR2: + return GIC_PIDR2_ARCH_GICv3 | 0x0b; + case GITS_PIDR4: + return 0x40; /* This is a 64K software visible page */ + /* The following are the ID registers for (any) GIC. */ + case GITS_CIDR0: + return 0x0d; + case GITS_CIDR1: + return 0xf0; + case GITS_CIDR2: + return 0x05; + case GITS_CIDR3: + return 0xb1; + } + + return 0; +} + +static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, + phys_addr_t db, + u32 devid, u32 eventid) +{ + struct vgic_translation_cache_entry *cte; + + list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { + /* + * If we hit a NULL entry, there is nothing after this + * point. + */ + if (!cte->irq) + break; + + if (cte->db != db || cte->devid != devid || + cte->eventid != eventid) + continue; + + /* + * Move this entry to the head, as it is the most + * recently used. 
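[Aside] Two small conventions in vgic_mmio_read_its_typer() deserve a note: architectural size fields such as Devbits and IDbits are encoded as "value minus one" (the job of GIC_ENCODE_SZ()), and 32-bit guest accesses to 64-bit ITS registers are served by extracting only the addressed bytes. A rough userspace sketch of both ideas, using my own simplified helpers rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

/* "Size minus one" field encoding: a field advertising N valid bits stores N - 1. */
static uint64_t encode_sz(uint64_t n, unsigned int width)
{
	return (n - 1) & ((1ULL << width) - 1);
}

/* Simplified stand-in for extract_bytes(): return 'len' bytes of 'reg'
 * starting at byte 'offset', which is how a 32-bit read of the 64-bit
 * GITS_TYPER is answered. */
static uint64_t get_bytes(uint64_t reg, unsigned int offset, unsigned int len)
{
	uint64_t mask = (len == 8) ? ~0ULL : ((1ULL << (8 * len)) - 1);

	return (reg >> (8 * offset)) & mask;
}

int main(void)
{
	printf("16 DeviceID bits encode as %llu\n",
	       (unsigned long long)encode_sz(16, 5));		/* prints 15 */
	printf("upper word of 0x1122334455667788: 0x%llx\n",
	       (unsigned long long)get_bytes(0x1122334455667788ULL, 4, 4));
	return 0;
}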
+ */ + if (!list_is_first(&cte->entry, &dist->lpi_translation_cache)) + list_move(&cte->entry, &dist->lpi_translation_cache); + + return cte->irq; + } + + return NULL; +} + +static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, + u32 devid, u32 eventid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + unsigned long flags; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + irq = __vgic_its_check_cache(dist, db, devid, eventid); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + return irq; +} + +static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, + struct vgic_irq *irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_translation_cache_entry *cte; + unsigned long flags; + phys_addr_t db; + + /* Do not cache a directly injected interrupt */ + if (irq->hw) + return; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + if (unlikely(list_empty(&dist->lpi_translation_cache))) + goto out; + + /* + * We could have raced with another CPU caching the same + * translation behind our back, so let's check it is not in + * already + */ + db = its->vgic_its_base + GITS_TRANSLATER; + if (__vgic_its_check_cache(dist, db, devid, eventid)) + goto out; + + /* Always reuse the last entry (LRU policy) */ + cte = list_last_entry(&dist->lpi_translation_cache, + typeof(*cte), entry); + + /* + * Caching the translation implies having an extra reference + * to the interrupt, so drop the potential reference on what + * was in the cache, and increment it on the new interrupt. + */ + if (cte->irq) + __vgic_put_lpi_locked(kvm, cte->irq); + + vgic_get_irq_kref(irq); + + cte->db = db; + cte->devid = devid; + cte->eventid = eventid; + cte->irq = irq; + + /* Move the new translation to the head of the list */ + list_move(&cte->entry, &dist->lpi_translation_cache); + +out: + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); +} + +void vgic_its_invalidate_cache(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_translation_cache_entry *cte; + unsigned long flags; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { + /* + * If we hit a NULL entry, there is nothing after this + * point. 
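[Aside] The translation cache implemented here is a plain LRU: a lookup hit moves the entry to the head of the list, and a new translation always recycles the tail entry (dropping the reference it held, then taking one on the new interrupt). A toy userspace model of the same policy, with an int standing in for the cached vgic_irq pointer and arbitrary example keys:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CACHE_SLOTS 4	/* slot 0 is most recently used, the last slot is recycled */

struct tc_entry {
	uint64_t db;
	uint32_t devid, eventid;
	int intid;
	int valid;
};

static struct tc_entry cache[CACHE_SLOTS];

/* Move slot 'idx' to the head, shifting everything above it down by one. */
static void promote(int idx)
{
	struct tc_entry hit = cache[idx];

	memmove(&cache[1], &cache[0], idx * sizeof(cache[0]));
	cache[0] = hit;
}

static int lookup(uint64_t db, uint32_t devid, uint32_t eventid)
{
	for (int i = 0; i < CACHE_SLOTS; i++) {
		if (!cache[i].valid)
			break;		/* nothing valid past a free slot */
		if (cache[i].db == db && cache[i].devid == devid &&
		    cache[i].eventid == eventid) {
			promote(i);	/* hit: becomes most recently used */
			return cache[0].intid;
		}
	}
	return -1;
}

static void insert(uint64_t db, uint32_t devid, uint32_t eventid, int intid)
{
	/* Recycle the least recently used slot (the tail), then promote it. */
	cache[CACHE_SLOTS - 1] = (struct tc_entry){ db, devid, eventid, intid, 1 };
	promote(CACHE_SLOTS - 1);
}

int main(void)
{
	insert(0x80a0000ULL, 0, 0, 8192);	/* made-up doorbell address */
	printf("hit: %d\n", lookup(0x80a0000ULL, 0, 0));
	printf("miss: %d\n", lookup(0x80a0000ULL, 1, 0));
	return 0;
}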
+ */ + if (!cte->irq) + break; + + __vgic_put_lpi_locked(kvm, cte->irq); + cte->irq = NULL; + } + + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); +} + +int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, struct vgic_irq **irq) +{ + struct kvm_vcpu *vcpu; + struct its_ite *ite; + + if (!its->enabled) + return -EBUSY; + + ite = find_ite(its, devid, eventid); + if (!ite || !its_is_collection_mapped(ite->collection)) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + if (!vcpu) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + if (!vcpu->arch.vgic_cpu.lpis_enabled) + return -EBUSY; + + vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); + + *irq = ite->irq; + return 0; +} + +struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) +{ + u64 address; + struct kvm_io_device *kvm_io_dev; + struct vgic_io_device *iodev; + + if (!vgic_has_its(kvm)) + return ERR_PTR(-ENODEV); + + if (!(msi->flags & KVM_MSI_VALID_DEVID)) + return ERR_PTR(-EINVAL); + + address = (u64)msi->address_hi << 32 | msi->address_lo; + + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); + if (!kvm_io_dev) + return ERR_PTR(-EINVAL); + + if (kvm_io_dev->ops != &kvm_io_gic_ops) + return ERR_PTR(-EINVAL); + + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + if (iodev->iodev_type != IODEV_ITS) + return ERR_PTR(-EINVAL); + + return iodev->its; +} + +/* + * Find the target VCPU and the LPI number for a given devid/eventid pair + * and make this IRQ pending, possibly injecting it. + * Must be called with the its_lock mutex held. + * Returns 0 on success, a positive error value for any ITS mapping + * related errors and negative error values for generic errors. + */ +static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid) +{ + struct vgic_irq *irq = NULL; + unsigned long flags; + int err; + + err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq); + if (err) + return err; + + if (irq->hw) + return irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, true); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + vgic_queue_irq_unlock(kvm, irq, flags); + + return 0; +} + +int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_irq *irq; + unsigned long flags; + phys_addr_t db; + + db = (u64)msi->address_hi << 32 | msi->address_lo; + irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data); + + if (!irq) + return -1; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + vgic_queue_irq_unlock(kvm, irq, flags); + + return 0; +} + +/* + * Queries the KVM IO bus framework to get the ITS pointer from the given + * doorbell address. + * We then call vgic_its_trigger_msi() with the decoded data. + * According to the KVM_SIGNAL_MSI API description returns 1 on success. + */ +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_its *its; + int ret; + + if (!vgic_its_inject_cached_translation(kvm, msi)) + return 1; + + its = vgic_msi_to_its(kvm, msi); + if (IS_ERR(its)) + return PTR_ERR(its); + + mutex_lock(&its->its_lock); + ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data); + mutex_unlock(&its->its_lock); + + if (ret < 0) + return ret; + + /* + * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 + * if the guest has blocked the MSI. So we map any LPI mapping + * related error to that. 
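[Aside] The return-value plumbing here is easy to misread: ITS-internal mapping errors are positive E_ITS_* codes, generic errors are negative, and KVM_SIGNAL_MSI expects 1 for "delivered" and 0 for "blocked by the guest". A minimal sketch of that mapping, with illustrative values only:

#include <stdio.h>

/* Map an ITS translation result onto the KVM_SIGNAL_MSI convention used by
 * vgic_its_inject_msi(): >0 delivered, 0 blocked/unmapped by the guest,
 * <0 a real host-side error. */
static int msi_inject_result(int its_err)
{
	if (its_err < 0)
		return its_err;	/* generic error, e.g. -EBUSY when the ITS is disabled */
	if (its_err > 0)
		return 0;	/* positive E_ITS_* mapping error: the guest's problem */
	return 1;		/* LPI translated and made pending */
}

int main(void)
{
	printf("%d %d %d\n",
	       msi_inject_result(0),	/* 1 */
	       msi_inject_result(1),	/* 0: stand-in for any E_ITS_* code */
	       msi_inject_result(-16));	/* -EBUSY passed through */
	return 0;
}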
+ */ + if (ret) + return 0; + else + return 1; +} + +/* Requires the its_lock to be held. */ +static void its_free_ite(struct kvm *kvm, struct its_ite *ite) +{ + list_del(&ite->ite_list); + + /* This put matches the get in vgic_add_lpi. */ + if (ite->irq) { + if (ite->irq->hw) + WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); + + vgic_put_irq(kvm, ite->irq); + } + + kfree(ite); +} + +static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) +{ + return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); +} + +#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) +#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) +#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) +#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) +#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) +#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) +#define its_cmd_get_ittaddr(cmd) (its_cmd_mask_field(cmd, 2, 8, 44) << 8) +#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) +#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) + +/* + * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + ite = find_ite(its, device_id, event_id); + if (ite && its_is_collection_mapped(ite->collection)) { + /* + * Though the spec talks about removing the pending state, we + * don't bother here since we clear the ITTE anyway and the + * pending state is a property of the ITTE struct. + */ + vgic_its_invalidate_cache(kvm); + + its_free_ite(kvm, ite); + return 0; + } + + return E_ITS_DISCARD_UNMAPPED_INTERRUPT; +} + +/* + * The MOVI command moves an ITTE to a different collection. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct kvm_vcpu *vcpu; + struct its_ite *ite; + struct its_collection *collection; + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_MOVI_UNMAPPED_INTERRUPT; + + if (!its_is_collection_mapped(ite->collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + ite->collection = collection; + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + vgic_its_invalidate_cache(kvm); + + return update_affinity(ite->irq, vcpu); +} + +/* + * Check whether an ID can be stored into the corresponding guest table. + * For a direct table this is pretty easy, but gets a bit nasty for + * indirect tables. We check whether the resulting guest physical address + * is actually valid (covered by a memslot and guest accessible). + * For this we have to read the respective first level entry. 
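[Aside] Each ITS command is four little-endian 64-bit words, and every handler above pulls its operands out with its_cmd_mask_field(). A quick userspace decoder for a hand-assembled MAPTI command; the field positions follow the macros above, and the 0x0a opcode is MAPTI's encoding per the GICv3 spec (little-endian host assumed, so the le64_to_cpu() step is a no-op here):

#include <stdint.h>
#include <stdio.h>

/* Pull a bit-field out of one of the four command words. */
static uint64_t cmd_field(const uint64_t *cmd, int word, int shift, int size)
{
	return (cmd[word] >> shift) & ((1ULL << size) - 1);
}

int main(void)
{
	/* Hand-built MAPTI: DeviceID=2, EventID=5, pINTID=8200, ICID=1. */
	uint64_t cmd[4] = {
		(2ULL << 32) | 0x0a,	/* word 0: DeviceID, opcode */
		(8200ULL << 32) | 5,	/* word 1: pINTID, EventID */
		1,			/* word 2: ICID */
		0,
	};

	printf("op 0x%llx dev %llu event %llu lpi %llu icid %llu\n",
	       (unsigned long long)cmd_field(cmd, 0, 0, 8),
	       (unsigned long long)cmd_field(cmd, 0, 32, 32),
	       (unsigned long long)cmd_field(cmd, 1, 0, 32),
	       (unsigned long long)cmd_field(cmd, 1, 32, 32),
	       (unsigned long long)cmd_field(cmd, 2, 0, 16));
	return 0;
}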
+ */ +static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, + gpa_t *eaddr) +{ + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + u64 indirect_ptr, type = GITS_BASER_TYPE(baser); + phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); + int esz = GITS_BASER_ENTRY_SIZE(baser); + int index, idx; + gfn_t gfn; + bool ret; + + switch (type) { + case GITS_BASER_TYPE_DEVICE: + if (id >= BIT_ULL(VITS_TYPER_DEVBITS)) + return false; + break; + case GITS_BASER_TYPE_COLLECTION: + /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */ + if (id >= BIT_ULL(16)) + return false; + break; + default: + return false; + } + + if (!(baser & GITS_BASER_INDIRECT)) { + phys_addr_t addr; + + if (id >= (l1_tbl_size / esz)) + return false; + + addr = base + id * esz; + gfn = addr >> PAGE_SHIFT; + + if (eaddr) + *eaddr = addr; + + goto out; + } + + /* calculate and check the index into the 1st level */ + index = id / (SZ_64K / esz); + if (index >= (l1_tbl_size / sizeof(u64))) + return false; + + /* Each 1st level entry is represented by a 64-bit value. */ + if (kvm_read_guest_lock(its->dev->kvm, + base + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) + return false; + + indirect_ptr = le64_to_cpu(indirect_ptr); + + /* check the valid bit of the first level entry */ + if (!(indirect_ptr & BIT_ULL(63))) + return false; + + /* Mask the guest physical address and calculate the frame number. */ + indirect_ptr &= GENMASK_ULL(51, 16); + + /* Find the address of the actual entry */ + index = id % (SZ_64K / esz); + indirect_ptr += index * esz; + gfn = indirect_ptr >> PAGE_SHIFT; + + if (eaddr) + *eaddr = indirect_ptr; + +out: + idx = srcu_read_lock(&its->dev->kvm->srcu); + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); + srcu_read_unlock(&its->dev->kvm->srcu, idx); + return ret; +} + +static int vgic_its_alloc_collection(struct vgic_its *its, + struct its_collection **colp, + u32 coll_id) +{ + struct its_collection *collection; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + + collection = kzalloc(sizeof(*collection), GFP_KERNEL); + if (!collection) + return -ENOMEM; + + collection->collection_id = coll_id; + collection->target_addr = COLLECTION_NOT_MAPPED; + + list_add_tail(&collection->coll_list, &its->collection_list); + *colp = collection; + + return 0; +} + +static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) +{ + struct its_collection *collection; + struct its_device *device; + struct its_ite *ite; + + /* + * Clearing the mapping for that collection ID removes the + * entry from the list. If there wasn't any before, we can + * go home early. + */ + collection = find_collection(its, coll_id); + if (!collection) + return; + + for_each_lpi_its(device, ite, its) + if (ite->collection && + ite->collection->collection_id == coll_id) + ite->collection = NULL; + + list_del(&collection->coll_list); + kfree(collection); +} + +/* Must be called with its_lock mutex held */ +static struct its_ite *vgic_its_alloc_ite(struct its_device *device, + struct its_collection *collection, + u32 event_id) +{ + struct its_ite *ite; + + ite = kzalloc(sizeof(*ite), GFP_KERNEL); + if (!ite) + return ERR_PTR(-ENOMEM); + + ite->event_id = event_id; + ite->collection = collection; + + list_add_tail(&ite->ite_list, &device->itt_head); + return ite; +} + +/* + * The MAPTI and MAPI commands map LPIs to ITTEs. + * Must be called with its_lock mutex held. 
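[Aside] The indirect (two-level) case in vgic_its_check_id() is mostly index arithmetic: with entry size esz, a 64K level-2 page holds 64K/esz entries, so the ID selects a level-1 slot (one u64 each) and an offset within the level-2 page that slot points to. A sketch of just that arithmetic, leaving out the guest reads, the valid-bit check and the address masking:

#include <stdint.h>
#include <stdio.h>

#define SZ_64K		0x10000ULL
#define L1E_SIZE	8		/* each level-1 entry is one u64 */

static void locate_indirect(uint32_t id, uint32_t esz)
{
	uint32_t per_l2_page = SZ_64K / esz;
	uint32_t l1_index = id / per_l2_page;
	uint64_t l1_offset = (uint64_t)l1_index * L1E_SIZE;
	uint64_t l2_offset = (uint64_t)(id % per_l2_page) * esz;

	printf("id %u: L1 slot %u (offset 0x%llx), L2 offset 0x%llx\n",
	       id, l1_index, (unsigned long long)l1_offset,
	       (unsigned long long)l2_offset);
}

int main(void)
{
	locate_indirect(100, 8);	/* fits in the first level-2 page */
	locate_indirect(9000, 8);	/* 8192 entries per page -> L1 slot 1 */
	return 0;
}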
+ */ +static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_ite *ite; + struct kvm_vcpu *vcpu = NULL; + struct its_device *device; + struct its_collection *collection, *new_coll = NULL; + struct vgic_irq *irq; + int lpi_nr; + + device = find_its_device(its, device_id); + if (!device) + return E_ITS_MAPTI_UNMAPPED_DEVICE; + + if (event_id >= BIT_ULL(device->num_eventid_bits)) + return E_ITS_MAPTI_ID_OOR; + + if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) + lpi_nr = its_cmd_get_physical_id(its_cmd); + else + lpi_nr = event_id; + if (lpi_nr < GIC_LPI_OFFSET || + lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) + return E_ITS_MAPTI_PHYSICALID_OOR; + + /* If there is an existing mapping, behavior is UNPREDICTABLE. */ + if (find_ite(its, device_id, event_id)) + return 0; + + collection = find_collection(its, coll_id); + if (!collection) { + int ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + new_coll = collection; + } + + ite = vgic_its_alloc_ite(device, collection, event_id); + if (IS_ERR(ite)) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + return PTR_ERR(ite); + } + + if (its_is_collection_mapped(collection)) + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq = vgic_add_lpi(kvm, lpi_nr, vcpu); + if (IS_ERR(irq)) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + its_free_ite(kvm, ite); + return PTR_ERR(irq); + } + ite->irq = irq; + + return 0; +} + +/* Requires the its_lock to be held. */ +static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) +{ + struct its_ite *ite, *temp; + + /* + * The spec says that unmapping a device with still valid + * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, + * since we cannot leave the memory unreferenced. + */ + list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) + its_free_ite(kvm, ite); + + vgic_its_invalidate_cache(kvm); + + list_del(&device->dev_list); + kfree(device); +} + +/* its lock must be held */ +static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_device *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) + vgic_its_free_device(kvm, cur); +} + +/* its lock must be held */ +static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_collection *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list) + vgic_its_free_collection(its, cur->collection_id); +} + +/* Must be called with its_lock mutex held */ +static struct its_device *vgic_its_alloc_device(struct vgic_its *its, + u32 device_id, gpa_t itt_addr, + u8 num_eventid_bits) +{ + struct its_device *device; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + return ERR_PTR(-ENOMEM); + + device->device_id = device_id; + device->itt_addr = itt_addr; + device->num_eventid_bits = num_eventid_bits; + INIT_LIST_HEAD(&device->itt_head); + + list_add_tail(&device->dev_list, &its->device_list); + return device; +} + +/* + * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). + * Must be called with the its_lock mutex held. 
+ */ +static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + bool valid = its_cmd_get_validbit(its_cmd); + u8 num_eventid_bits = its_cmd_get_size(its_cmd); + gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); + struct its_device *device; + + if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) + return E_ITS_MAPD_DEVICE_OOR; + + if (valid && num_eventid_bits > VITS_TYPER_IDBITS) + return E_ITS_MAPD_ITTSIZE_OOR; + + device = find_its_device(its, device_id); + + /* + * The spec says that calling MAPD on an already mapped device + * invalidates all cached data for this device. We implement this + * by removing the mapping and re-establishing it. + */ + if (device) + vgic_its_free_device(kvm, device); + + /* + * The spec does not say whether unmapping a not-mapped device + * is an error, so we are done in any case. + */ + if (!valid) + return 0; + + device = vgic_its_alloc_device(its, device_id, itt_addr, + num_eventid_bits); + + return PTR_ERR_OR_ZERO(device); +} + +/* + * The MAPC command maps collection IDs to redistributors. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u16 coll_id; + u32 target_addr; + struct its_collection *collection; + bool valid; + + valid = its_cmd_get_validbit(its_cmd); + coll_id = its_cmd_get_collection(its_cmd); + target_addr = its_cmd_get_target_addr(its_cmd); + + if (target_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MAPC_PROCNUM_OOR; + + if (!valid) { + vgic_its_free_collection(its, coll_id); + vgic_its_invalidate_cache(kvm); + } else { + collection = find_collection(its, coll_id); + + if (!collection) { + int ret; + + ret = vgic_its_alloc_collection(its, &collection, + coll_id); + if (ret) + return ret; + collection->target_addr = target_addr; + } else { + collection->target_addr = target_addr; + update_affinity_collection(kvm, its, collection); + } + } + + return 0; +} + +/* + * The CLEAR command removes the pending state for a particular LPI. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_CLEAR_UNMAPPED_INTERRUPT; + + ite->irq->pending_latch = false; + + if (ite->irq->hw) + return irq_set_irqchip_state(ite->irq->host_irq, + IRQCHIP_STATE_PENDING, false); + + return 0; +} + +/* + * The INV command syncs the configuration bits from the memory table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_ite *ite; + + + ite = find_ite(its, device_id, event_id); + if (!ite) + return E_ITS_INV_UNMAPPED_INTERRUPT; + + return update_lpi_config(kvm, ite->irq, NULL, true); +} + +/* + * The INVALL command requests flushing of all IRQ data in this collection. + * Find the VCPU mapped to that collection, then iterate over the VM's list + * of mapped LPIs and update the configuration for each IRQ which targets + * the specified vcpu. The configuration will be read from the in-memory + * configuration table. + * Must be called with the its_lock mutex held. 
+ */ +static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_collection *collection; + struct kvm_vcpu *vcpu; + struct vgic_irq *irq; + u32 *intids; + int irq_count, i; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_INVALL_UNMAPPED_COLLECTION; + + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; + update_lpi_config(kvm, irq, vcpu, false); + vgic_put_irq(kvm, irq); + } + + kfree(intids); + + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) + its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + + return 0; +} + +/* + * The MOVALL command moves the pending state of all IRQs targeting one + * redistributor to another. We don't hold the pending state in the VCPUs, + * but in the IRQs instead, so there is really not much to do for us here. + * However the spec says that no IRQ must target the old redistributor + * afterwards, so we make sure that no LPI is using the associated target_vcpu. + * This command affects all LPIs in the system that target that redistributor. + */ +static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 target1_addr = its_cmd_get_target_addr(its_cmd); + u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); + struct kvm_vcpu *vcpu1, *vcpu2; + struct vgic_irq *irq; + u32 *intids; + int irq_count, i; + + if (target1_addr >= atomic_read(&kvm->online_vcpus) || + target2_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MOVALL_PROCNUM_OOR; + + if (target1_addr == target2_addr) + return 0; + + vcpu1 = kvm_get_vcpu(kvm, target1_addr); + vcpu2 = kvm_get_vcpu(kvm, target2_addr); + + irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); + + update_affinity(irq, vcpu2); + + vgic_put_irq(kvm, irq); + } + + vgic_its_invalidate_cache(kvm); + + kfree(intids); + return 0; +} + +/* + * The INT command injects the LPI associated with that DevID/EvID pair. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 msi_data = its_cmd_get_id(its_cmd); + u64 msi_devid = its_cmd_get_deviceid(its_cmd); + + return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); +} + +/* + * This function is called with the its_cmd lock held, but the ITS data + * structure lock dropped. 
+ */ +static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + int ret = -ENODEV; + + mutex_lock(&its->its_lock); + switch (its_cmd_get_command(its_cmd)) { + case GITS_CMD_MAPD: + ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); + break; + case GITS_CMD_MAPC: + ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); + break; + case GITS_CMD_MAPI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MAPTI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); + break; + case GITS_CMD_MOVI: + ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); + break; + case GITS_CMD_DISCARD: + ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); + break; + case GITS_CMD_CLEAR: + ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); + break; + case GITS_CMD_MOVALL: + ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); + break; + case GITS_CMD_INT: + ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); + break; + case GITS_CMD_INV: + ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); + break; + case GITS_CMD_INVALL: + ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); + break; + case GITS_CMD_SYNC: + /* we ignore this command: we are in sync all of the time */ + ret = 0; + break; + } + mutex_unlock(&its->its_lock); + + return ret; +} + +static u64 vgic_sanitise_its_baser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, + GITS_BASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, + GITS_BASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, + GITS_BASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* We support only one (ITS) page size: 64K */ + reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; + + return reg; +} + +static u64 vgic_sanitise_its_cbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK, + GITS_CBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, + GITS_CBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, + GITS_CBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* Sanitise the physical address to be 64k aligned. */ + reg &= ~GENMASK_ULL(15, 12); + + return reg; +} + +static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cbaser, addr & 7, len); +} + +static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* When GITS_CTLR.Enable is 1, this register is RO. */ + if (its->enabled) + return; + + mutex_lock(&its->cmd_lock); + its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); + its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); + its->creadr = 0; + /* + * CWRITER is architecturally UNKNOWN on reset, but we need to reset + * it to CREADR to make sure we start with an empty command buffer. + */ + its->cwriter = its->creadr; + mutex_unlock(&its->cmd_lock); +} + +#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) +#define ITS_CMD_SIZE 32 +#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) + +/* Must be called with the cmd_lock held. 
*/ +static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) +{ + gpa_t cbaser; + u64 cmd_buf[4]; + + /* Commands are only processed when the ITS is enabled. */ + if (!its->enabled) + return; + + cbaser = GITS_CBASER_ADDRESS(its->cbaser); + + while (its->cwriter != its->creadr) { + int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, + cmd_buf, ITS_CMD_SIZE); + /* + * If kvm_read_guest() fails, this could be due to the guest + * programming a bogus value in CBASER or something else going + * wrong from which we cannot easily recover. + * According to section 6.3.2 in the GICv3 spec we can just + * ignore that command then. + */ + if (!ret) + vgic_its_handle_command(kvm, its, cmd_buf); + + its->creadr += ITS_CMD_SIZE; + if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) + its->creadr = 0; + } +} + +/* + * By writing to CWRITER the guest announces new commands to be processed. + * To avoid any races in the first place, we take the its_cmd lock, which + * protects our ring buffer variables, so that there is only one user + * per ITS handling commands at a given time. + */ +static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u64 reg; + + if (!its) + return; + + mutex_lock(&its->cmd_lock); + + reg = update_64bit_reg(its->cwriter, addr & 7, len, val); + reg = ITS_CMD_OFFSET(reg); + if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + mutex_unlock(&its->cmd_lock); + return; + } + its->cwriter = reg; + + vgic_its_process_commands(kvm, its); + + mutex_unlock(&its->cmd_lock); +} + +static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cwriter, addr & 0x7, len); +} + +static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->creadr, addr & 0x7, len); +} + +static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 cmd_offset; + int ret = 0; + + mutex_lock(&its->cmd_lock); + + if (its->enabled) { + ret = -EBUSY; + goto out; + } + + cmd_offset = ITS_CMD_OFFSET(val); + if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + ret = -EINVAL; + goto out; + } + + its->creadr = cmd_offset; +out: + mutex_unlock(&its->cmd_lock); + return ret; +} + +#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) +static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u64 reg; + + switch (BASER_INDEX(addr)) { + case 0: + reg = its->baser_device_table; + break; + case 1: + reg = its->baser_coll_table; + break; + default: + reg = 0; + break; + } + + return extract_bytes(reg, addr & 7, len); +} + +#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) +static void vgic_mmio_write_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 entry_size, table_type; + u64 reg, *regptr, clearbits = 0; + + /* When GITS_CTLR.Enable is 1, we ignore write accesses. 
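[Aside] CREADR and CWRITER are byte offsets into the command ring described by CBASER: the low byte of CBASER encodes the number of 4K pages minus one (ITS_CMD_BUFFER_SIZE() above), every command is 32 bytes, and CREADR wraps to zero at the end of the ring. A minimal host-side simulation of the wrap logic in vgic_its_process_commands():

#include <stdint.h>
#include <stdio.h>

#define CMD_SIZE 32			/* every ITS command is 32 bytes */

static uint64_t cmd_buf_size(uint64_t cbaser)
{
	return ((cbaser & 0xff) + 1) << 12;	/* (pages - 1) field, 4K pages */
}

int main(void)
{
	uint64_t cbaser = 0;			/* size field 0 -> one 4K page */
	uint64_t size = cmd_buf_size(cbaser);
	uint64_t creadr = size - CMD_SIZE;	/* pretend we are at the last slot */
	uint64_t cwriter = CMD_SIZE;		/* guest queued two more commands */

	while (creadr != cwriter) {
		printf("process command at offset 0x%llx\n",
		       (unsigned long long)creadr);
		creadr += CMD_SIZE;
		if (creadr == size)		/* wrap exactly as the kernel loop does */
			creadr = 0;
	}
	return 0;
}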
*/ + if (its->enabled) + return; + + switch (BASER_INDEX(addr)) { + case 0: + regptr = &its->baser_device_table; + entry_size = abi->dte_esz; + table_type = GITS_BASER_TYPE_DEVICE; + break; + case 1: + regptr = &its->baser_coll_table; + entry_size = abi->cte_esz; + table_type = GITS_BASER_TYPE_COLLECTION; + clearbits = GITS_BASER_INDIRECT; + break; + default: + return; + } + + reg = update_64bit_reg(*regptr, addr & 7, len, val); + reg &= ~GITS_BASER_RO_MASK; + reg &= ~clearbits; + + reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; + reg |= table_type << GITS_BASER_TYPE_SHIFT; + reg = vgic_sanitise_its_baser(reg); + + *regptr = reg; + + if (!(reg & GITS_BASER_VALID)) { + /* Take the its_lock to prevent a race with a save/restore */ + mutex_lock(&its->its_lock); + switch (table_type) { + case GITS_BASER_TYPE_DEVICE: + vgic_its_free_device_list(kvm, its); + break; + case GITS_BASER_TYPE_COLLECTION: + vgic_its_free_collection_list(kvm, its); + break; + } + mutex_unlock(&its->its_lock); + } +} + +static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 reg = 0; + + mutex_lock(&its->cmd_lock); + if (its->creadr == its->cwriter) + reg |= GITS_CTLR_QUIESCENT; + if (its->enabled) + reg |= GITS_CTLR_ENABLE; + mutex_unlock(&its->cmd_lock); + + return reg; +} + +static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + mutex_lock(&its->cmd_lock); + + /* + * It is UNPREDICTABLE to enable the ITS if any of the CBASER or + * device/collection BASER are invalid + */ + if (!its->enabled && (val & GITS_CTLR_ENABLE) && + (!(its->baser_device_table & GITS_BASER_VALID) || + !(its->baser_coll_table & GITS_BASER_VALID) || + !(its->cbaser & GITS_CBASER_VALID))) + goto out; + + its->enabled = !!(val & GITS_CTLR_ENABLE); + if (!its->enabled) + vgic_its_invalidate_cache(kvm); + + /* + * Try to process any pending commands. This function bails out early + * if the ITS is disabled or no commands have been queued. 
+ */ + vgic_its_process_commands(kvm, its); + +out: + mutex_unlock(&its->cmd_lock); +} + +#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ +} + +#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ + .uaccess_its_write = uwr, \ +} + +static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +static struct vgic_register_region its_registers[] = { + REGISTER_ITS_DESC(GITS_CTLR, + vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC_UACCESS(GITS_IIDR, + vgic_mmio_read_its_iidr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_iidr, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_TYPER, + vgic_mmio_read_its_typer, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CBASER, + vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CWRITER, + vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC_UACCESS(GITS_CREADR, + vgic_mmio_read_its_creadr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_creadr, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_BASER, + vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_IDREGS_BASE, + vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, + VGIC_ACCESS_32bit), +}; + +/* This is called on setting the LPI enable bit in the redistributor. 
*/ +void vgic_enable_lpis(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) + its_sync_lpi_pending_table(vcpu); +} + +static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its, + u64 addr) +{ + struct vgic_io_device *iodev = &its->iodev; + int ret; + + mutex_lock(&kvm->slots_lock); + if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { + ret = -EBUSY; + goto out; + } + + its->vgic_its_base = addr; + iodev->regions = its_registers; + iodev->nr_regions = ARRAY_SIZE(its_registers); + kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); + + iodev->base_addr = its->vgic_its_base; + iodev->iodev_type = IODEV_ITS; + iodev->its = its; + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, + KVM_VGIC_V3_ITS_SIZE, &iodev->dev); +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +/* Default is 16 cached LPIs per vcpu */ +#define LPI_DEFAULT_PCPU_CACHE_SIZE 16 + +void vgic_lpi_translation_cache_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned int sz; + int i; + + if (!list_empty(&dist->lpi_translation_cache)) + return; + + sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE; + + for (i = 0; i < sz; i++) { + struct vgic_translation_cache_entry *cte; + + /* An allocation failure is not fatal */ + cte = kzalloc(sizeof(*cte), GFP_KERNEL); + if (WARN_ON(!cte)) + break; + + INIT_LIST_HEAD(&cte->entry); + list_add(&cte->entry, &dist->lpi_translation_cache); + } +} + +void vgic_lpi_translation_cache_destroy(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_translation_cache_entry *cte, *tmp; + + vgic_its_invalidate_cache(kvm); + + list_for_each_entry_safe(cte, tmp, + &dist->lpi_translation_cache, entry) { + list_del(&cte->entry); + kfree(cte); + } +} + +#define INITIAL_BASER_VALUE \ + (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ + GITS_BASER_PAGE_SIZE_64K) + +#define INITIAL_PROPBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) + +static int vgic_its_create(struct kvm_device *dev, u32 type) +{ + struct vgic_its *its; + + if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) + return -ENODEV; + + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + if (!its) + return -ENOMEM; + + if (vgic_initialized(dev->kvm)) { + int ret = vgic_v4_init(dev->kvm); + if (ret < 0) { + kfree(its); + return ret; + } + + vgic_lpi_translation_cache_init(dev->kvm); + } + + mutex_init(&its->its_lock); + mutex_init(&its->cmd_lock); + + its->vgic_its_base = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&its->device_list); + INIT_LIST_HEAD(&its->collection_list); + + dev->kvm->arch.vgic.msis_require_devid = true; + dev->kvm->arch.vgic.has_its = true; + its->enabled = false; + its->dev = dev; + + its->baser_device_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); + its->baser_coll_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); + dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; + + dev->private = its; + + return vgic_its_set_abi(its, NR_ITS_ABIS - 1); +} + +static void vgic_its_destroy(struct kvm_device *kvm_dev) +{ + struct kvm *kvm = kvm_dev->kvm; + struct vgic_its *its = kvm_dev->private; + + mutex_lock(&its->its_lock); + + vgic_its_free_device_list(kvm, its); + 
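[Aside] For context on how this creation path is reached, here is a hedged userspace sketch of the uAPI sequence that ends up in vgic_its_create() and, once the address is set, vgic_register_its_iodev(): create the device, program its 64K-aligned base, then initialise it. This assumes an arm64 host whose <linux/kvm.h> provides the vgic attribute constants, a VM fd with a GICv3 already configured, and it omits all error handling:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static int create_its(int vm_fd, uint64_t its_gpa)
{
	struct kvm_create_device cd = {
		.type = KVM_DEV_TYPE_ARM_VGIC_ITS,
	};
	struct kvm_device_attr addr_attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_ADDR,
		.attr  = KVM_VGIC_ITS_ADDR_TYPE,
		.addr  = (uint64_t)(uintptr_t)&its_gpa,	/* 64K-aligned GPA */
	};
	struct kvm_device_attr init_attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
		.attr  = KVM_DEV_ARM_VGIC_CTRL_INIT,
	};

	ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);		/* lands in vgic_its_create() */
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &addr_attr);	/* registers the MMIO frame */
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &init_attr);
	return cd.fd;
}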
vgic_its_free_collection_list(kvm, its); + + mutex_unlock(&its->its_lock); + kfree(its); + kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */ +} + +static int vgic_its_has_attr_regs(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + gpa_t offset = attr->attr; + int align; + + align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7; + + if (offset & align) + return -EINVAL; + + region = vgic_find_mmio_region(its_registers, + ARRAY_SIZE(its_registers), + offset); + if (!region) + return -ENXIO; + + return 0; +} + +static int vgic_its_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u64 *reg, bool is_write) +{ + const struct vgic_register_region *region; + struct vgic_its *its; + gpa_t addr, offset; + unsigned int len; + int align, ret = 0; + + its = dev->private; + offset = attr->attr; + + /* + * Although the spec supports upper/lower 32-bit accesses to + * 64-bit ITS registers, the userspace ABI requires 64-bit + * accesses to all 64-bit wide registers. We therefore only + * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID + * registers + */ + if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4)) + align = 0x3; + else + align = 0x7; + + if (offset & align) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { + ret = -ENXIO; + goto out; + } + + region = vgic_find_mmio_region(its_registers, + ARRAY_SIZE(its_registers), + offset); + if (!region) { + ret = -ENXIO; + goto out; + } + + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; + goto out; + } + + addr = its->vgic_its_base + offset; + + len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4; + + if (is_write) { + if (region->uaccess_its_write) + ret = region->uaccess_its_write(dev->kvm, its, addr, + len, *reg); + else + region->its_write(dev->kvm, its, addr, len, *reg); + } else { + *reg = region->its_read(dev->kvm, its, addr, len); + } + unlock_all_vcpus(dev->kvm); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + +static u32 compute_next_devid_offset(struct list_head *h, + struct its_device *dev) +{ + struct its_device *next; + u32 next_offset; + + if (list_is_last(&dev->dev_list, h)) + return 0; + next = list_next_entry(dev, dev_list); + next_offset = next->device_id - dev->device_id; + + return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET); +} + +static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) +{ + struct its_ite *next; + u32 next_offset; + + if (list_is_last(&ite->ite_list, h)) + return 0; + next = list_next_entry(ite, ite_list); + next_offset = next->event_id - ite->event_id; + + return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET); +} + +/** + * entry_fn_t - Callback called on a table entry restore path + * @its: its handle + * @id: id of the entry + * @entry: pointer to the entry + * @opaque: pointer to an opaque data + * + * Return: < 0 on error, 0 if last element was identified, id offset to next + * element otherwise + */ +typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, + void *opaque); + +/** + * scan_its_table - Scan a contiguous table in guest RAM and applies a function + * to each entry + * + * @its: its handle + * @base: base gpa of the table + * @size: size of the table in bytes + * @esz: entry size in bytes + * @start_id: the ID of the first entry in the table + * (non zero for 2d level tables) + * @fn: function to apply on each entry + * + * Return: < 0 on error, 0 if last element 
was identified, 1 otherwise + * (the last element may not be found on second level tables) + */ +static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, + int start_id, entry_fn_t fn, void *opaque) +{ + struct kvm *kvm = its->dev->kvm; + unsigned long len = size; + int id = start_id; + gpa_t gpa = base; + char entry[ESZ_MAX]; + int ret; + + memset(entry, 0, esz); + + while (len > 0) { + int next_offset; + size_t byte_offset; + + ret = kvm_read_guest_lock(kvm, gpa, entry, esz); + if (ret) + return ret; + + next_offset = fn(its, id, entry, opaque); + if (next_offset <= 0) + return next_offset; + + byte_offset = next_offset * esz; + id += next_offset; + gpa += byte_offset; + len -= byte_offset; + } + return 1; +} + +/** + * vgic_its_save_ite - Save an interrupt translation entry at @gpa + */ +static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, + struct its_ite *ite, gpa_t gpa, int ite_esz) +{ + struct kvm *kvm = its->dev->kvm; + u32 next_offset; + u64 val; + + next_offset = compute_next_eventid_offset(&dev->itt_head, ite); + val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) | + ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | + ite->collection->collection_id; + val = cpu_to_le64(val); + return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); +} + +/** + * vgic_its_restore_ite - restore an interrupt translation entry + * @event_id: id used for indexing + * @ptr: pointer to the ITE entry + * @opaque: pointer to the its_device + */ +static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, + void *ptr, void *opaque) +{ + struct its_device *dev = (struct its_device *)opaque; + struct its_collection *collection; + struct kvm *kvm = its->dev->kvm; + struct kvm_vcpu *vcpu = NULL; + u64 val; + u64 *p = (u64 *)ptr; + struct vgic_irq *irq; + u32 coll_id, lpi_id; + struct its_ite *ite; + u32 offset; + + val = *p; + + val = le64_to_cpu(val); + + coll_id = val & KVM_ITS_ITE_ICID_MASK; + lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT; + + if (!lpi_id) + return 1; /* invalid entry, no choice but to scan next entry */ + + if (lpi_id < VGIC_MIN_LPI) + return -EINVAL; + + offset = val >> KVM_ITS_ITE_NEXT_SHIFT; + if (event_id + offset >= BIT_ULL(dev->num_eventid_bits)) + return -EINVAL; + + collection = find_collection(its, coll_id); + if (!collection) + return -EINVAL; + + ite = vgic_its_alloc_ite(dev, collection, event_id); + if (IS_ERR(ite)) + return PTR_ERR(ite); + + if (its_is_collection_mapped(collection)) + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq = vgic_add_lpi(kvm, lpi_id, vcpu); + if (IS_ERR(irq)) + return PTR_ERR(irq); + ite->irq = irq; + + return offset; +} + +static int vgic_its_ite_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct its_ite *itea = container_of(a, struct its_ite, ite_list); + struct its_ite *iteb = container_of(b, struct its_ite, ite_list); + + if (itea->event_id < iteb->event_id) + return -1; + else + return 1; +} + +static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + gpa_t base = device->itt_addr; + struct its_ite *ite; + int ret; + int ite_esz = abi->ite_esz; + + list_sort(NULL, &device->itt_head, vgic_its_ite_cmp); + + list_for_each_entry(ite, &device->itt_head, ite_list) { + gpa_t gpa = base + ite->event_id * ite_esz; + + /* + * If an LPI carries the HW bit, this means that this + * interrupt is controlled by GICv4, and we do not + * have direct access to that state. 
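[Aside] The save/restore helpers all funnel through the entry_fn_t contract documented above: a callback returns a negative error, 0 when the last element has been handled, or the ID offset to the next element to look at, and the scanner advances accordingly (returning 1 if it falls off the end of the table). A purely in-memory toy model of that contract; the kernel version additionally reads each entry from guest RAM and advances the GPA by offset * esz:

#include <stdint.h>
#include <stdio.h>

typedef int (*entry_fn)(uint32_t id, const uint64_t *entry, void *opaque);

static int scan_table(const uint64_t *table, int nr, entry_fn fn, void *opaque)
{
	uint32_t id = 0;

	while (id < (uint32_t)nr) {
		int next = fn(id, &table[id], opaque);

		if (next <= 0)
			return next;	/* error, or last element reached */
		id += next;
	}
	return 1;			/* ran off the end: caller decides what that means */
}

static int handle_entry(uint32_t id, const uint64_t *entry, void *opaque)
{
	(void)opaque;
	if (!*entry)
		return 1;		/* invalid entry: skip to the next ID */
	printf("id %u holds 0x%llx\n", id, (unsigned long long)*entry);
	return (*entry == 0xdead) ? 0 : 1;	/* 0 means this was the last one */
}

int main(void)
{
	uint64_t table[] = { 0x11, 0, 0x22, 0xdead, 0x33 };

	printf("scan returned %d\n", scan_table(table, 5, handle_entry, NULL));
	return 0;
}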
Let's simply fail + * the save operation... + */ + if (ite->irq->hw) + return -EACCES; + + ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); + if (ret) + return ret; + } + return 0; +} + +/** + * vgic_its_restore_itt - restore the ITT of a device + * + * @its: its handle + * @dev: device handle + * + * Return 0 on success, < 0 on error + */ +static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + gpa_t base = dev->itt_addr; + int ret; + int ite_esz = abi->ite_esz; + size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz; + + ret = scan_its_table(its, base, max_size, ite_esz, 0, + vgic_its_restore_ite, dev); + + /* scan_its_table returns +1 if all ITEs are invalid */ + if (ret > 0) + ret = 0; + + return ret; +} + +/** + * vgic_its_save_dte - Save a device table entry at a given GPA + * + * @its: ITS handle + * @dev: ITS device + * @ptr: GPA + */ +static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, + gpa_t ptr, int dte_esz) +{ + struct kvm *kvm = its->dev->kvm; + u64 val, itt_addr_field; + u32 next_offset; + + itt_addr_field = dev->itt_addr >> 8; + next_offset = compute_next_devid_offset(&its->device_list, dev); + val = (1ULL << KVM_ITS_DTE_VALID_SHIFT | + ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) | + (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | + (dev->num_eventid_bits - 1)); + val = cpu_to_le64(val); + return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); +} + +/** + * vgic_its_restore_dte - restore a device table entry + * + * @its: its handle + * @id: device id the DTE corresponds to + * @ptr: kernel VA where the 8 byte DTE is located + * @opaque: unused + * + * Return: < 0 on error, 0 if the dte is the last one, id offset to the + * next dte otherwise + */ +static int vgic_its_restore_dte(struct vgic_its *its, u32 id, + void *ptr, void *opaque) +{ + struct its_device *dev; + gpa_t itt_addr; + u8 num_eventid_bits; + u64 entry = *(u64 *)ptr; + bool valid; + u32 offset; + int ret; + + entry = le64_to_cpu(entry); + + valid = entry >> KVM_ITS_DTE_VALID_SHIFT; + num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1; + itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK) + >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8; + + if (!valid) + return 1; + + /* dte entry is valid */ + offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; + + dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + ret = vgic_its_restore_itt(its, dev); + if (ret) { + vgic_its_free_device(its->dev->kvm, dev); + return ret; + } + + return offset; +} + +static int vgic_its_device_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct its_device *deva = container_of(a, struct its_device, dev_list); + struct its_device *devb = container_of(b, struct its_device, dev_list); + + if (deva->device_id < devb->device_id) + return -1; + else + return 1; +} + +/** + * vgic_its_save_device_tables - Save the device table and all ITT + * into guest RAM + * + * L1/L2 handling is hidden by vgic_its_check_id() helper which directly + * returns the GPA of the device entry + */ +static int vgic_its_save_device_tables(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_device_table; + struct its_device *dev; + int dte_esz = abi->dte_esz; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + list_sort(NULL, &its->device_list, vgic_its_device_cmp); + + list_for_each_entry(dev, &its->device_list, 
dev_list) { + int ret; + gpa_t eaddr; + + if (!vgic_its_check_id(its, baser, + dev->device_id, &eaddr)) + return -EINVAL; + + ret = vgic_its_save_itt(its, dev); + if (ret) + return ret; + + ret = vgic_its_save_dte(its, dev, eaddr, dte_esz); + if (ret) + return ret; + } + return 0; +} + +/** + * handle_l1_dte - callback used for L1 device table entries (2 stage case) + * + * @its: its handle + * @id: index of the entry in the L1 table + * @addr: kernel VA + * @opaque: unused + * + * L1 table entries are scanned by steps of 1 entry + * Return < 0 if error, 0 if last dte was found when scanning the L2 + * table, +1 otherwise (meaning next L1 entry must be scanned) + */ +static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, + void *opaque) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int l2_start_id = id * (SZ_64K / abi->dte_esz); + u64 entry = *(u64 *)addr; + int dte_esz = abi->dte_esz; + gpa_t gpa; + int ret; + + entry = le64_to_cpu(entry); + + if (!(entry & KVM_ITS_L1E_VALID_MASK)) + return 1; + + gpa = entry & KVM_ITS_L1E_ADDR_MASK; + + ret = scan_its_table(its, gpa, SZ_64K, dte_esz, + l2_start_id, vgic_its_restore_dte, NULL); + + return ret; +} + +/** + * vgic_its_restore_device_tables - Restore the device table and all ITT + * from guest RAM to internal data structs + */ +static int vgic_its_restore_device_tables(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_device_table; + int l1_esz, ret; + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + gpa_t l1_gpa; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + l1_gpa = GITS_BASER_ADDR_48_to_52(baser); + + if (baser & GITS_BASER_INDIRECT) { + l1_esz = GITS_LVL1_ENTRY_SIZE; + ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, + handle_l1_dte, NULL); + } else { + l1_esz = abi->dte_esz; + ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0, + vgic_its_restore_dte, NULL); + } + + /* scan_its_table returns +1 if all entries are invalid */ + if (ret > 0) + ret = 0; + + return ret; +} + +static int vgic_its_save_cte(struct vgic_its *its, + struct its_collection *collection, + gpa_t gpa, int esz) +{ + u64 val; + + val = (1ULL << KVM_ITS_CTE_VALID_SHIFT | + ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | + collection->collection_id); + val = cpu_to_le64(val); + return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); +} + +static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) +{ + struct its_collection *collection; + struct kvm *kvm = its->dev->kvm; + u32 target_addr, coll_id; + u64 val; + int ret; + + BUG_ON(esz > sizeof(val)); + ret = kvm_read_guest_lock(kvm, gpa, &val, esz); + if (ret) + return ret; + val = le64_to_cpu(val); + if (!(val & KVM_ITS_CTE_VALID_MASK)) + return 0; + + target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); + coll_id = val & KVM_ITS_CTE_ICID_MASK; + + if (target_addr != COLLECTION_NOT_MAPPED && + target_addr >= atomic_read(&kvm->online_vcpus)) + return -EINVAL; + + collection = find_collection(its, coll_id); + if (collection) + return -EEXIST; + ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + collection->target_addr = target_addr; + return 1; +} + +/** + * vgic_its_save_collection_table - Save the collection table into + * guest RAM + */ +static int vgic_its_save_collection_table(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_coll_table; + gpa_t gpa = 
GITS_BASER_ADDR_48_to_52(baser); + struct its_collection *collection; + u64 val; + size_t max_size, filled = 0; + int ret, cte_esz = abi->cte_esz; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + ret = vgic_its_save_cte(its, collection, gpa, cte_esz); + if (ret) + return ret; + gpa += cte_esz; + filled += cte_esz; + } + + if (filled == max_size) + return 0; + + /* + * table is not fully filled, add a last dummy element + * with valid bit unset + */ + val = 0; + BUG_ON(cte_esz > sizeof(val)); + ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); + return ret; +} + +/** + * vgic_its_restore_collection_table - reads the collection table + * in guest memory and restores the ITS internal state. Requires the + * BASER registers to be restored before. + */ +static int vgic_its_restore_collection_table(struct vgic_its *its) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_coll_table; + int cte_esz = abi->cte_esz; + size_t max_size, read = 0; + gpa_t gpa; + int ret; + + if (!(baser & GITS_BASER_VALID)) + return 0; + + gpa = GITS_BASER_ADDR_48_to_52(baser); + + max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + + while (read < max_size) { + ret = vgic_its_restore_cte(its, gpa, cte_esz); + if (ret <= 0) + break; + gpa += cte_esz; + read += cte_esz; + } + + if (ret > 0) + return 0; + + return ret; +} + +/** + * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM + * according to v0 ABI + */ +static int vgic_its_save_tables_v0(struct vgic_its *its) +{ + int ret; + + ret = vgic_its_save_device_tables(its); + if (ret) + return ret; + + return vgic_its_save_collection_table(its); +} + +/** + * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM + * to internal data structs according to V0 ABI + * + */ +static int vgic_its_restore_tables_v0(struct vgic_its *its) +{ + int ret; + + ret = vgic_its_restore_collection_table(its); + if (ret) + return ret; + + return vgic_its_restore_device_tables(its); +} + +static int vgic_its_commit_v0(struct vgic_its *its) +{ + const struct vgic_its_abi *abi; + + abi = vgic_its_get_abi(its); + its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + + its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + + its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + return 0; +} + +static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) +{ + /* We need to keep the ABI specific field values */ + its->baser_coll_table &= ~GITS_BASER_VALID; + its->baser_device_table &= ~GITS_BASER_VALID; + its->cbaser = 0; + its->creadr = 0; + its->cwriter = 0; + its->enabled = 0; + vgic_its_free_device_list(kvm, its); + vgic_its_free_collection_list(kvm, its); +} + +static int vgic_its_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_ITS_ADDR_TYPE: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + case KVM_DEV_ARM_ITS_CTRL_RESET: + return 0; + case KVM_DEV_ARM_ITS_SAVE_TABLES: + return 0; + case KVM_DEV_ARM_ITS_RESTORE_TABLES: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: + return vgic_its_has_attr_regs(dev, attr); + } + return -ENXIO; +} + +static 
int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int ret = 0; + + if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ + return 0; + + mutex_lock(&kvm->lock); + mutex_lock(&its->its_lock); + + if (!lock_all_vcpus(kvm)) { + mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + switch (attr) { + case KVM_DEV_ARM_ITS_CTRL_RESET: + vgic_its_reset(kvm, its); + break; + case KVM_DEV_ARM_ITS_SAVE_TABLES: + ret = abi->save_tables(its); + break; + case KVM_DEV_ARM_ITS_RESTORE_TABLES: + ret = abi->restore_tables(its); + break; + } + + unlock_all_vcpus(kvm); + mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->lock); + return ret; +} + +static int vgic_its_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct vgic_its *its = dev->private; + int ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + u64 addr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, + addr, SZ_64K); + if (ret) + return ret; + + return vgic_register_its_iodev(dev->kvm, its, addr); + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + return vgic_its_ctrl(dev->kvm, its, attr->attr); + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_its_attr_regs_access(dev, attr, ®, true); + } + } + return -ENXIO; +} + +static int vgic_its_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + struct vgic_its *its = dev->private; + u64 addr = its->vgic_its_base; + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + } + case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + int ret; + + ret = vgic_its_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + default: + return -ENXIO; + } + + return 0; +} + +static struct kvm_device_ops kvm_arm_vgic_its_ops = { + .name = "kvm-arm-vgic-its", + .create = vgic_its_create, + .destroy = vgic_its_destroy, + .set_attr = vgic_its_set_attr, + .get_attr = vgic_its_get_attr, + .has_attr = vgic_its_has_attr, +}; + +int kvm_vgic_register_its_device(void) +{ + return kvm_register_device_ops(&kvm_arm_vgic_its_ops, + KVM_DEV_TYPE_ARM_VGIC_ITS); +} diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c new file mode 100644 index 000000000000..44419679f91a --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -0,0 +1,741 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGIC: KVM DEVICE API + * + * Copyright (C) 2015 ARM Ltd. 
+ * Author: Marc Zyngier <marc.zyngier@arm.com> + */ +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <linux/uaccess.h> +#include <asm/kvm_mmu.h> +#include <asm/cputype.h> +#include "vgic.h" + +/* common helpers */ + +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) +{ + if (addr & ~kvm_phys_mask(kvm)) + return -E2BIG; + + if (!IS_ALIGNED(addr, alignment)) + return -EINVAL; + + if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) + return -EEXIST; + + return 0; +} + +static int vgic_check_type(struct kvm *kvm, int type_needed) +{ + if (kvm->arch.vgic.vgic_model != type_needed) + return -ENODEV; + else + return 0; +} + +/** + * kvm_vgic_addr - set or get vgic VM base addresses + * @kvm: pointer to the vm struct + * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX + * @addr: pointer to address value + * @write: if true set the address in the VM address space, if false read the + * address + * + * Set or get the vgic base addresses for the distributor and the virtual CPU + * interface in the VM physical address space. These addresses are properties + * of the emulated core/SoC and therefore user space initially knows this + * information. + * Check them for sanity (alignment, double assignment). We can't check for + * overlapping regions in case of a virtual GICv3 here, since we don't know + * the number of VCPUs yet, so we defer this check to map_resources(). + */ +int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) +{ + int r = 0; + struct vgic_dist *vgic = &kvm->arch.vgic; + phys_addr_t *addr_ptr, alignment; + u64 undef_value = VGIC_ADDR_UNDEF; + + mutex_lock(&kvm->lock); + switch (type) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_4K; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + addr_ptr = &vgic->vgic_cpu_base; + alignment = SZ_4K; + break; + case KVM_VGIC_V3_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_64K; + break; + case KVM_VGIC_V3_ADDR_TYPE_REDIST: { + struct vgic_redist_region *rdreg; + + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (r) + break; + if (write) { + r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); + goto out; + } + rdreg = list_first_entry(&vgic->rd_regions, + struct vgic_redist_region, list); + if (!rdreg) + addr_ptr = &undef_value; + else + addr_ptr = &rdreg->base; + break; + } + case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: + { + struct vgic_redist_region *rdreg; + u8 index; + + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (r) + break; + + index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK; + + if (write) { + gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK; + u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK) + >> KVM_VGIC_V3_RDIST_COUNT_SHIFT; + u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK) + >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT; + + if (!count || flags) + r = -EINVAL; + else + r = vgic_v3_set_redist_base(kvm, index, + base, count); + goto out; + } + + rdreg = vgic_v3_rdist_region_from_index(kvm, index); + if (!rdreg) { + r = -ENOENT; + goto out; + } + + *addr = index; + *addr |= rdreg->base; + *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; + goto out; + } + default: + r = -ENODEV; + } + + if (r) + goto out; + + if (write) { + r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); + if (!r) + *addr_ptr = *addr; + } else { 
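/*
 * Illustrative sketch (not part of the patch): packing the 64-bit value that
 * the KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION write path above decodes. Only the
 * uapi masks/shifts referenced by that code are used; the base/count/index in
 * the usage comment are made-up examples, and the flags field must stay zero.
 */
#include <stdint.h>
#include <linux/kvm.h>

static uint64_t redist_region_attr(uint64_t base, uint64_t count,
				   uint64_t index)
{
	return (base & KVM_VGIC_V3_RDIST_BASE_MASK) |
	       (count << KVM_VGIC_V3_RDIST_COUNT_SHIFT) |
	       (index & KVM_VGIC_V3_RDIST_INDEX_MASK);
}

/* e.g. redist_region_attr(0x80a0000ULL, 4, 0): region 0, 4 redistributors */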
+ *addr = *addr_ptr; + } + +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int vgic_set_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + r = kvm_vgic_addr(dev->kvm, type, &addr, true); + return (r == -ENODEV) ? -ENXIO : r; + } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 val; + int ret = 0; + + if (get_user(val, uaddr)) + return -EFAULT; + + /* + * We require: + * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs + * - at most 1024 interrupts + * - a multiple of 32 interrupts + */ + if (val < (VGIC_NR_PRIVATE_IRQS + 32) || + val > VGIC_MAX_RESERVED || + (val & 31)) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) + ret = -EBUSY; + else + dev->kvm->arch.vgic.nr_spis = + val - VGIC_NR_PRIVATE_IRQS; + + mutex_unlock(&dev->kvm->lock); + + return ret; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: { + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + mutex_lock(&dev->kvm->lock); + r = vgic_init(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return r; + } + break; + } + } + + return -ENXIO; +} + +static int vgic_get_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r = -ENXIO; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + r = kvm_vgic_addr(dev->kvm, type, &addr, false); + if (r) + return (r == -ENODEV) ? 
-ENXIO : r; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + r = put_user(dev->kvm->arch.vgic.nr_spis + + VGIC_NR_PRIVATE_IRQS, uaddr); + break; + } + } + + return r; +} + +static int vgic_create(struct kvm_device *dev, u32 type) +{ + return kvm_vgic_create(dev->kvm, type); +} + +static void vgic_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +int kvm_register_vgic_device(unsigned long type) +{ + int ret = -ENODEV; + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: + ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, + KVM_DEV_TYPE_ARM_VGIC_V3); + + if (ret) + break; + ret = kvm_vgic_register_its_device(); + break; + } + + return ret; +} + +int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + int cpuid; + + cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> + KVM_DEV_ARM_VGIC_CPUID_SHIFT; + + if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) + return -EINVAL; + + reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid); + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + return 0; +} + +/* unlocks vcpus from @vcpu_lock_idx and smaller */ +static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) +{ + struct kvm_vcpu *tmp_vcpu; + + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&tmp_vcpu->mutex); + } +} + +void unlock_all_vcpus(struct kvm *kvm) +{ + unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); +} + +/* Returns true if all vcpus were locked, false otherwise */ +bool lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *tmp_vcpu; + int c; + + /* + * Any time a vcpu is run, vcpu_load is called which tries to grab the + * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure + * that no other VCPUs are run and fiddle with the vgic state while we + * access it. 
+ */ + kvm_for_each_vcpu(c, tmp_vcpu, kvm) { + if (!mutex_trylock(&tmp_vcpu->mutex)) { + unlock_vcpus(kvm, c - 1); + return false; + } + } + + return true; +} + +/** + * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @reg: address the value is read or written + * @is_write: true if userspace is writing a register + */ +static int vgic_v2_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u32 *reg, bool is_write) +{ + struct vgic_reg_attr reg_attr; + gpa_t addr; + struct kvm_vcpu *vcpu; + int ret; + + ret = vgic_v2_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + mutex_lock(&dev->kvm->lock); + + ret = vgic_init(dev->kvm); + if (ret) + goto out; + + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; + goto out; + } + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); + break; + default: + ret = -EINVAL; + break; + } + + unlock_all_vcpus(dev->kvm); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + +static int vgic_v2_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_set_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_v2_attr_regs_access(dev, attr, ®, true); + } + } + + return -ENXIO; +} + +static int vgic_v2_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_get_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg = 0; + + ret = vgic_v2_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + } + + return -ENXIO; +} + +static int vgic_v2_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + case KVM_VGIC_V2_ADDR_TYPE_CPU: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_has_attr_regs(dev, attr); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v2_ops = { + .name = "kvm-arm-vgic-v2", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v2_set_attr, + .get_attr = vgic_v2_get_attr, + .has_attr = vgic_v2_has_attr, +}; + +int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + unsigned long vgic_mpidr, mpidr_reg; + + /* + * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, + * attr might not hold MPIDR. Hence assume vcpu0. 
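/*
 * Illustrative userspace sketch (not part of the patch): reading one GICv2
 * distributor register through the GRP_DIST_REGS path above. The attr word
 * carries the vcpu index in the CPUID field and the byte offset into the
 * GICD frame in the OFFSET field, exactly as vgic_v2_parse_attr() expects;
 * vgic_fd is an assumed GICv2 KVM device fd.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int vgic_v2_read_dist_reg(int vgic_fd, uint64_t cpuid,
				 uint32_t offset, uint32_t *val)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
		.attr  = ((cpuid << KVM_DEV_ARM_VGIC_CPUID_SHIFT) &
			  KVM_DEV_ARM_VGIC_CPUID_MASK) |
			 (offset & KVM_DEV_ARM_VGIC_OFFSET_MASK),
		.addr  = (uint64_t)(uintptr_t)val,
	};

	return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
}

/* e.g. vgic_v2_read_dist_reg(fd, 0, 0x8, &iidr) fetches GICD_IIDR */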
+ */ + if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { + vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> + KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; + + mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); + reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); + } else { + reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); + } + + if (!reg_attr->vcpu) + return -EINVAL; + + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + return 0; +} + +/* + * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @reg: address the value is read or written + * @is_write: true if userspace is writing a register + */ +static int vgic_v3_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u64 *reg, bool is_write) +{ + struct vgic_reg_attr reg_attr; + gpa_t addr; + struct kvm_vcpu *vcpu; + int ret; + u32 tmp32; + + ret = vgic_v3_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + mutex_lock(&dev->kvm->lock); + + if (unlikely(!vgic_initialized(dev->kvm))) { + ret = -EBUSY; + goto out; + } + + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; + goto out; + } + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + if (is_write) + tmp32 = *reg; + + ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); + if (!is_write) + *reg = tmp32; + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + if (is_write) + tmp32 = *reg; + + ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); + if (!is_write) + *reg = tmp32; + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 regid; + + regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); + ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, + regid, reg); + break; + } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + unsigned int info, intid; + + info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; + if (info == VGIC_LEVEL_INFO_LINE_LEVEL) { + intid = attr->attr & + KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; + ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, + intid, reg); + } else { + ret = -EINVAL; + } + break; + } + default: + ret = -EINVAL; + break; + } + + unlock_all_vcpus(dev->kvm); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + +static int vgic_v3_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_set_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 tmp32; + u64 reg; + + if (get_user(tmp32, uaddr)) + return -EFAULT; + + reg = tmp32; + return vgic_v3_attr_regs_access(dev, attr, ®, true); + } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_v3_attr_regs_access(dev, attr, ®, true); + } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + if (get_user(tmp32, uaddr)) + return -EFAULT; + + reg = tmp32; + return vgic_v3_attr_regs_access(dev, attr, ®, true); + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: { + int ret; + + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + mutex_lock(&dev->kvm->lock); + + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + ret = 
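/*
 * Illustrative userspace sketch (not part of the patch): the ioctl that ends
 * up in the vgic_v3_save_pending_tables() call completed on the next line;
 * vgic_fd is an assumed GICv3 KVM device fd.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int vgic_v3_save_pending(int vgic_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
		.attr  = KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES,
	};

	return ioctl(vgic_fd, KVM_SET_DEVICE_ATTR, &attr);
}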
vgic_v3_save_pending_tables(dev->kvm); + unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return ret; + } + break; + } + } + return -ENXIO; +} + +static int vgic_v3_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_get_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + ret = vgic_v3_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + tmp32 = reg; + return put_user(tmp32, uaddr); + } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + ret = vgic_v3_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + ret = vgic_v3_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + tmp32 = reg; + return put_user(tmp32, uaddr); + } + } + return -ENXIO; +} + +static int vgic_v3_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V3_ADDR_TYPE_DIST: + case KVM_VGIC_V3_ADDR_TYPE_REDIST: + case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_has_attr_regs(dev, attr); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == + VGIC_LEVEL_INFO_LINE_LEVEL) + return 0; + break; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v3_ops = { + .name = "kvm-arm-vgic-v3", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v3_set_attr, + .get_attr = vgic_v3_get_attr, + .has_attr = vgic_v3_has_attr, +}; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c new file mode 100644 index 000000000000..a016f07adc28 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGICv2 MMIO handling functions + */ + +#include <linux/irqchip/arm-gic.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/nospec.h> + +#include <kvm/iodev.h> +#include <kvm/arm_vgic.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +/* + * The Revision field in the IIDR have the following meanings: + * + * Revision 1: Report GICv2 interrupts as group 0 instead of group 1 + * Revision 2: Interrupt groups are guest-configurable and signaled using + * their configured groups. + */ + +static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + u32 value; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + value = vgic->enabled ? 
GICD_ENABLE : 0; + break; + case GIC_DIST_CTR: + value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; + break; + case GIC_DIST_IIDR: + value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | + (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | + (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + bool was_enabled = dist->enabled; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + dist->enabled = val & GICD_ENABLE; + if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + break; + case GIC_DIST_CTR: + case GIC_DIST_IIDR: + /* Nothing to do */ + return; + } +} + +static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + switch (addr & 0x0c) { + case GIC_DIST_IIDR: + if (val != vgic_mmio_read_v2_misc(vcpu, addr, len)) + return -EINVAL; + + /* + * If we observe a write to GICD_IIDR we know that userspace + * has been updated and has had a chance to cope with older + * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting + * interrupts as group 1, and therefore we now allow groups to + * be user writable. Doing this by default would break + * migration from old kernels to new kernels with legacy + * userspace. + */ + vcpu->kvm->arch.vgic.v2_groups_user_writable = true; + return 0; + } + + vgic_mmio_write_v2_misc(vcpu, addr, len, val); + return 0; +} + +static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (vcpu->kvm->arch.vgic.v2_groups_user_writable) + vgic_mmio_write_group(vcpu, addr, len, val); + + return 0; +} + +static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus); + int intid = val & 0xf; + int targets = (val >> 16) & 0xff; + int mode = (val >> 24) & 0x03; + int c; + struct kvm_vcpu *vcpu; + unsigned long flags; + + switch (mode) { + case 0x0: /* as specified by targets */ + break; + case 0x1: + targets = (1U << nr_vcpus) - 1; /* all, ... 
*/ + targets &= ~(1U << source_vcpu->vcpu_id); /* but self */ + break; + case 0x2: /* this very vCPU only */ + targets = (1U << source_vcpu->vcpu_id); + break; + case 0x3: /* reserved */ + return; + } + + kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) { + struct vgic_irq *irq; + + if (!(targets & (1U << c))) + continue; + + irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + irq->source |= 1U << source_vcpu->vcpu_id; + + vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags); + vgic_put_irq(source_vcpu->kvm, irq); + } +} + +static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + val |= (u64)irq->targets << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0); + int i; + unsigned long flags; + + /* GICD_ITARGETSR[0-7] are read-only */ + if (intid < VGIC_NR_PRIVATE_IRQS) + return; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); + int target; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + irq->targets = (val >> (i * 8)) & cpu_mask; + target = irq->targets ? __ffs(irq->targets) : 0; + irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = addr & 0x0f; + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + val |= (u64)irq->source << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); + } + return val; +} + +static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = addr & 0x0f; + int i; + unsigned long flags; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + irq->source &= ~((val >> (i * 8)) & 0xff); + if (!irq->source) + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = addr & 0x0f; + int i; + unsigned long flags; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + irq->source |= (val >> (i * 8)) & 0xff; + + if (irq->source) { + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + vgic_put_irq(vcpu->kvm, irq); + } +} + +#define GICC_ARCH_VERSION_V2 0x2 + +/* These are for userland accesses only, there is no guest-facing emulation. 
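/*
 * Illustrative sketch (not part of the patch): the GICD_SGIR layout decoded
 * by vgic_mmio_write_sgir() above — SGI number in bits [3:0], CPU target
 * list in [23:16], target filter in [25:24].
 */
#include <stdint.h>

static uint32_t gicd_sgir(uint32_t filter, uint32_t targets, uint32_t intid)
{
	return ((filter & 0x3) << 24) | ((targets & 0xff) << 16) | (intid & 0xf);
}

/*
 * gicd_sgir(0, 0x5, 3) sends SGI 3 to vcpus 0 and 2;
 * gicd_sgir(1, 0, 3) sends it to every vcpu but the writer.
 */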
*/ +static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_vmcr vmcr; + u32 val; + + vgic_get_vmcr(vcpu, &vmcr); + + switch (addr & 0xff) { + case GIC_CPU_CTRL: + val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT; + val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT; + val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT; + val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT; + val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT; + val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT; + + break; + case GIC_CPU_PRIMASK: + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * the PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >> + GICV_PMR_PRIORITY_SHIFT; + break; + case GIC_CPU_BINPOINT: + val = vmcr.bpr; + break; + case GIC_CPU_ALIAS_BINPOINT: + val = vmcr.abpr; + break; + case GIC_CPU_IDENT: + val = ((PRODUCT_ID_KVM << 20) | + (GICC_ARCH_VERSION_V2 << 16) | + IMPLEMENTER_ARM); + break; + default: + return 0; + } + + return val; +} + +static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + + switch (addr & 0xff) { + case GIC_CPU_CTRL: + vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0); + vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1); + vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl); + vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn); + vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR); + vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS); + + break; + case GIC_CPU_PRIMASK: + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * the PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) & + GICV_PMR_PRIORITY_MASK; + break; + case GIC_CPU_BINPOINT: + vmcr.bpr = val; + break; + case GIC_CPU_ALIAS_BINPOINT: + vmcr.abpr = val; + break; + } + + vgic_set_vmcr(vcpu, &vmcr); +} + +static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + int n; /* which APRn is this */ + + n = (addr >> 2) & 0x3; + + if (kvm_vgic_global_state.type == VGIC_V2) { + /* GICv2 hardware systems support max. 32 groups */ + if (n != 0) + return 0; + return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr; + } else { + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (n > vgic_v3_max_apr_idx(vcpu)) + return 0; + + n = array_index_nospec(n, 4); + + /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ + return vgicv3->vgic_ap1r[n]; + } +} + +static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int n; /* which APRn is this */ + + n = (addr >> 2) & 0x3; + + if (kvm_vgic_global_state.type == VGIC_V2) { + /* GICv2 hardware systems support max. 
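/*
 * Illustrative sketch (not part of the patch): the PMR scaling performed by
 * the vcpuif accessors above. GICH_VMCR.VMPriMask is a 5-bit field, so only
 * the upper five bits of an 8-bit GICC_PMR priority survive; the literals
 * below assume the usual GICV_PMR_PRIORITY_SHIFT == 3 / mask == 0xf8 layout.
 */
#include <stdint.h>

static uint8_t pmr_to_uapi(uint8_t vmcr_pmr)
{
	return (vmcr_pmr & 0xf8) >> 3;			/* e.g. 0xa0 -> 0x14 */
}

static uint8_t uapi_to_pmr(uint8_t uapi_val)
{
	return (uint8_t)((uapi_val << 3) & 0xf8);	/* e.g. 0x14 -> 0xa0 */
}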
32 groups */ + if (n != 0) + return; + vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val; + } else { + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (n > vgic_v3_max_apr_idx(vcpu)) + return; + + n = array_index_nospec(n, 4); + + /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ + vgicv3->vgic_ap1r[n] = val; + } +} + +static const struct vgic_register_region vgic_v2_dist_registers[] = { + REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL, + vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, + NULL, vgic_mmio_uaccess_write_v2_misc, + 12, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, + vgic_mmio_read_group, vgic_mmio_write_group, + NULL, vgic_mmio_uaccess_write_v2_group, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, + vgic_mmio_read_pending, vgic_mmio_write_spending, + NULL, vgic_uaccess_write_spending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + NULL, vgic_uaccess_write_cpending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, + vgic_mmio_read_active, vgic_mmio_write_sactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, + vgic_mmio_read_active, vgic_mmio_write_cactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, + vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, + 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, + vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, + vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, + vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), +}; + +static const struct vgic_register_region vgic_v2_cpu_registers[] = { + REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, + vgic_mmio_read_apr, vgic_mmio_write_apr, 16, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), +}; + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) +{ + 
dev->regions = vgic_v2_dist_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return SZ_4K; +} + +int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; + gpa_t addr; + int ret; + + ret = vgic_v2_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + iodev.regions = vgic_v2_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + iodev.base_addr = 0; + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + iodev.regions = vgic_v2_cpu_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + iodev.base_addr = 0; + break; + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. */ + if (addr & 3) + return -ENXIO; + + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; + + return 0; +} + +int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_cpu_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), + .iodev_type = IODEV_CPUIF, + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), + .iodev_type = IODEV_DIST, + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c new file mode 100644 index 000000000000..d2339a2b9fb9 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -0,0 +1,1063 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGICv3 MMIO handling functions + */ + +#include <linux/bitfield.h> +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <kvm/iodev.h> +#include <kvm/arm_vgic.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +/* extract @num bytes at @offset bytes offset in data */ +unsigned long extract_bytes(u64 data, unsigned int offset, + unsigned int num) +{ + return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); +} + +/* allows updates of any half of a 64-bit register (or the whole thing) */ +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val) +{ + int lower = (offset & 4) * 8; + int upper = lower + 8 * len - 1; + + reg &= ~GENMASK_ULL(upper, lower); + val &= GENMASK_ULL(len * 8 - 1, 0); + + return reg | ((u64)val << lower); +} + +bool vgic_has_its(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return false; + + return dist->has_its; +} + +bool vgic_supports_direct_msis(struct kvm *kvm) +{ + return (kvm_vgic_global_state.has_gicv4_1 || + (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm))); +} + +/* + * The Revision field in the IIDR have the following meanings: + * + * Revision 2: Interrupt groups are guest-configurable and signaled using + * their configured groups. 
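/*
 * Illustrative sketch (not part of the patch): a standalone copy of the
 * update_64bit_reg() helper above, showing that a 32-bit access at offset 4
 * only touches the upper half of a 64-bit register.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t upd64(uint64_t reg, unsigned int offset, unsigned int len,
		      uint64_t val)
{
	unsigned int lower = (offset & 4) * 8;		/* 0 or 32 */
	uint64_t mask = (len == 8) ? ~0ULL : ((1ULL << (len * 8)) - 1);

	return (reg & ~(mask << lower)) | ((val & mask) << lower);
}

int main(void)
{
	uint64_t irouter = 0xaa;

	/* 4-byte write of 0x1 to the upper word leaves Aff0-Aff2 alone */
	printf("%#llx\n", (unsigned long long)upd64(irouter, 4, 4, 0x1));
	return 0;	/* prints 0x1000000aa */
}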
+ */ + +static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + u32 value = 0; + + switch (addr & 0x0c) { + case GICD_CTLR: + if (vgic->enabled) + value |= GICD_CTLR_ENABLE_SS_G1; + value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; + if (vgic->nassgireq) + value |= GICD_CTLR_nASSGIreq; + break; + case GICD_TYPER: + value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + if (vgic_has_its(vcpu->kvm)) { + value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; + value |= GICD_TYPER_LPIS; + } else { + value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; + } + break; + case GICD_TYPER2: + if (kvm_vgic_global_state.has_gicv4_1) + value = GICD_TYPER2_nASSGIcap; + break; + case GICD_IIDR: + value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | + (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | + (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + switch (addr & 0x0c) { + case GICD_CTLR: { + bool was_enabled, is_hwsgi; + + mutex_lock(&vcpu->kvm->lock); + + was_enabled = dist->enabled; + is_hwsgi = dist->nassgireq; + + dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; + + /* Not a GICv4.1? No HW SGIs */ + if (!kvm_vgic_global_state.has_gicv4_1) + val &= ~GICD_CTLR_nASSGIreq; + + /* Dist stays enabled? nASSGIreq is RO */ + if (was_enabled && dist->enabled) { + val &= ~GICD_CTLR_nASSGIreq; + val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi); + } + + /* Switching HW SGIs? */ + dist->nassgireq = val & GICD_CTLR_nASSGIreq; + if (is_hwsgi != dist->nassgireq) + vgic_v4_configure_vsgis(vcpu->kvm); + + if (kvm_vgic_global_state.has_gicv4_1 && + was_enabled != dist->enabled) + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4); + else if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + + mutex_unlock(&vcpu->kvm->lock); + break; + } + case GICD_TYPER: + case GICD_TYPER2: + case GICD_IIDR: + /* This is at best for documentation purposes... */ + return; + } +} + +static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + switch (addr & 0x0c) { + case GICD_TYPER2: + case GICD_IIDR: + if (val != vgic_mmio_read_v3_misc(vcpu, addr, len)) + return -EINVAL; + return 0; + case GICD_CTLR: + /* Not a GICv4.1? No HW SGIs */ + if (!kvm_vgic_global_state.has_gicv4_1) + val &= ~GICD_CTLR_nASSGIreq; + + dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; + dist->nassgireq = val & GICD_CTLR_nASSGIreq; + return 0; + } + + vgic_mmio_write_v3_misc(vcpu, addr, len, val); + return 0; +} + +static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + int intid = VGIC_ADDR_TO_INTID(addr, 64); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); + unsigned long ret = 0; + + if (!irq) + return 0; + + /* The upper word is RAZ for us. */ + if (!(addr & 4)) + ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); + + vgic_put_irq(vcpu->kvm, irq); + return ret; +} + +static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int intid = VGIC_ADDR_TO_INTID(addr, 64); + struct vgic_irq *irq; + unsigned long flags; + + /* The upper word is WI for us since we don't implement Aff3. 
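/*
 * Illustrative sketch (not part of the patch): the ITLinesNumber encoding
 * used for GICD_TYPER by vgic_mmio_read_v3_misc() above — the total number
 * of interrupt IDs divided by 32, minus one.
 */
static unsigned int gicd_typer_itlines(unsigned int nr_spis)
{
	unsigned int total = nr_spis + 32;	/* + VGIC_NR_PRIVATE_IRQS */

	return (total >> 5) - 1;
}

/* gicd_typer_itlines(96) == 3, i.e. 32 * (3 + 1) = 128 interrupt IDs */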
*/ + if (addr & 4) + return; + + irq = vgic_get_irq(vcpu->kvm, NULL, intid); + + if (!irq) + return; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* We only care about and preserve Aff0, Aff1 and Aff2. */ + irq->mpidr = val & GENMASK(23, 0); + irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); +} + +static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0; +} + + +static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + bool was_enabled = vgic_cpu->lpis_enabled; + + if (!vgic_has_its(vcpu->kvm)) + return; + + vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; + + if (was_enabled && !vgic_cpu->lpis_enabled) { + vgic_flush_pending_lpis(vcpu); + vgic_its_invalidate_cache(vcpu->kvm); + } + + if (!was_enabled && vgic_cpu->lpis_enabled) + vgic_enable_lpis(vcpu); +} + +static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_redist_region *rdreg = vgic_cpu->rdreg; + int target_vcpu_id = vcpu->vcpu_id; + gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + + (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; + u64 value; + + value = (u64)(mpidr & GENMASK(23, 0)) << 32; + value |= ((target_vcpu_id & 0xffff) << 8); + + if (addr == last_rdist_typer) + value |= GICR_TYPER_LAST; + if (vgic_has_its(vcpu->kvm)) + value |= GICR_TYPER_PLPIS; + + return extract_bytes(value, addr & 7, len); +} + +static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); +} + +static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GICD_PIDR2: + /* report a GICv3 compliant implementation */ + return 0x3b; + } + + return 0; +} + +static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* + * pending state of interrupt is latched in pending_latch variable. + * Userspace will save and restore pending state and line_level + * separately. + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst + * for handling of ISPENDR and ICPENDR. 
+ */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + bool state = irq->pending_latch; + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + int err; + + err = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &state); + WARN_ON(err); + } + + if (state) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (test_bit(i, &val)) { + /* + * pending_latch is set irrespective of irq type + * (level or edge) to avoid dependency that VM should + * restore irq config before pending info. + */ + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + irq->pending_latch = false; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +/* We want to avoid outer shareable. */ +u64 vgic_sanitise_shareability(u64 field) +{ + switch (field) { + case GIC_BASER_OuterShareable: + return GIC_BASER_InnerShareable; + default: + return field; + } +} + +/* Avoid any inner non-cacheable mapping. */ +u64 vgic_sanitise_inner_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_nCnB: + case GIC_BASER_CACHE_nC: + return GIC_BASER_CACHE_RaWb; + default: + return field; + } +} + +/* Non-cacheable or same-as-inner are OK. */ +u64 vgic_sanitise_outer_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_SameAsInner: + case GIC_BASER_CACHE_nC: + return field; + default: + return GIC_BASER_CACHE_nC; + } +} + +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)) +{ + u64 field = (reg & field_mask) >> field_shift; + + field = sanitise_fn(field) << field_shift; + return (reg & ~field_mask) | field; +} + +#define PROPBASER_RES0_MASK \ + (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) +#define PENDBASER_RES0_MASK \ + (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ + GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) + +static u64 vgic_sanitise_pendbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, + GICR_PENDBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, + GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, + GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PENDBASER_RES0_MASK; + + return reg; +} + +static u64 vgic_sanitise_propbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, + GICR_PROPBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, + GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, + GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PROPBASER_RES0_MASK; + return reg; +} + +static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *dist = 
&vcpu->kvm->arch.vgic; + + return extract_bytes(dist->propbaser, addr & 7, len); +} + +static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 old_propbaser, propbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_cpu->lpis_enabled) + return; + + do { + old_propbaser = READ_ONCE(dist->propbaser); + propbaser = old_propbaser; + propbaser = update_64bit_reg(propbaser, addr & 4, len, val); + propbaser = vgic_sanitise_propbaser(propbaser); + } while (cmpxchg64(&dist->propbaser, old_propbaser, + propbaser) != old_propbaser); +} + +static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 value = vgic_cpu->pendbaser; + + value &= ~GICR_PENDBASER_PTZ; + + return extract_bytes(value, addr & 7, len); +} + +static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 old_pendbaser, pendbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_cpu->lpis_enabled) + return; + + do { + old_pendbaser = READ_ONCE(vgic_cpu->pendbaser); + pendbaser = old_pendbaser; + pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); + pendbaser = vgic_sanitise_pendbaser(pendbaser); + } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser, + pendbaser) != old_pendbaser); +} + +/* + * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the + * redistributors, while SPIs are covered by registers in the distributor + * block. Trying to set private IRQs in this block gets ignored. + * We take some special care here to fix the calculation of the register + * offset. 
+ */ +#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = bpi, \ + .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ + .access_flags = acc, \ + .read = vgic_mmio_read_raz, \ + .write = vgic_mmio_write_wi, \ + }, { \ + .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ + .bits_per_irq = bpi, \ + .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = ur, \ + .uaccess_write = uw, \ + } + +static const struct vgic_register_region vgic_v3_dist_registers[] = { + REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR, + vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, + NULL, vgic_mmio_uaccess_write_v3_misc, + 16, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICD_STATUSR, + vgic_mmio_read_rao, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, + vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, + vgic_mmio_read_pending, vgic_mmio_write_spending, + vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, + vgic_mmio_read_active, vgic_mmio_write_sactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, + vgic_mmio_read_active, vgic_mmio_write_cactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, + 1, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, + vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, + 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, + vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, + vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, + vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, + vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, + vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, + VGIC_ACCESS_32bit), +}; + +static const struct vgic_register_region vgic_v3_rd_registers[] = { + /* RD_base registers */ + REGISTER_DESC_WITH_LENGTH(GICR_CTLR, + vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_STATUSR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IIDR, + vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_TYPER, + vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, + VGIC_ACCESS_64bit | 
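/*
 * Illustrative sketch (not part of the patch): the offset split performed by
 * the REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED() macro above. With the first
 * VGIC_NR_PRIVATE_IRQS (32) interrupts handled as RAZ/WI, the SPI part of a
 * shared register range starts (32 * bits_per_irq) / 8 bytes in.
 */
static unsigned int shared_range_start(unsigned int reg_offset,
				       unsigned int bits_per_irq)
{
	return reg_offset + (bits_per_irq * 32) / 8;
}

/* shared_range_start(0x100, 1) == 0x104; shared_range_start(0x400, 8) == 0x420 */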
VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_WAKER, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, + vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, + vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, + vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, + VGIC_ACCESS_32bit), + /* SGI_base registers */ + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0, + vgic_mmio_read_group, vgic_mmio_write_group, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_spending, + vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0, + vgic_mmio_read_active, vgic_mmio_write_sactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0, + vgic_mmio_read_active, vgic_mmio_write_cactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0, + vgic_mmio_read_priority, vgic_mmio_write_priority, 32, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0, + vgic_mmio_read_config, vgic_mmio_write_config, 8, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), +}; + +unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v3_dist_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return SZ_64K; +} + +/** + * vgic_register_redist_iodev - register a single redist iodev + * @vcpu: The VCPU to which the redistributor belongs + * + * Register a KVM iodev for this VCPU's redistributor using the address + * provided. + * + * Return 0 on success, -ERRNO otherwise. + */ +int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct vgic_dist *vgic = &kvm->arch.vgic; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; + struct vgic_redist_region *rdreg; + gpa_t rd_base; + int ret; + + if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) + return 0; + + /* + * We may be creating VCPUs before having set the base address for the + * redistributor region, in which case we will come back to this + * function for all VCPUs when the base address is set. Just return + * without doing any work for now. 
+ */ + rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions); + if (!rdreg) + return 0; + + if (!vgic_v3_check_base(kvm)) + return -EINVAL; + + vgic_cpu->rdreg = rdreg; + + rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; + + kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); + rd_dev->base_addr = rd_base; + rd_dev->iodev_type = IODEV_REDIST; + rd_dev->regions = vgic_v3_rd_registers; + rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); + rd_dev->redist_vcpu = vcpu; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, + 2 * SZ_64K, &rd_dev->dev); + mutex_unlock(&kvm->slots_lock); + + if (ret) + return ret; + + rdreg->free_index++; + return 0; +} + +static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) +{ + struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; + + kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev); +} + +static int vgic_register_all_redist_iodevs(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int c, ret = 0; + + kvm_for_each_vcpu(c, vcpu, kvm) { + ret = vgic_register_redist_iodev(vcpu); + if (ret) + break; + } + + if (ret) { + /* The current c failed, so we start with the previous one. */ + mutex_lock(&kvm->slots_lock); + for (c--; c >= 0; c--) { + vcpu = kvm_get_vcpu(kvm, c); + vgic_unregister_redist_iodev(vcpu); + } + mutex_unlock(&kvm->slots_lock); + } + + return ret; +} + +/** + * vgic_v3_insert_redist_region - Insert a new redistributor region + * + * Performs various checks before inserting the rdist region in the list. + * Those tests depend on whether the size of the rdist region is known + * (ie. count != 0). The list is sorted by rdist region index. + * + * @kvm: kvm handle + * @index: redist region index + * @base: base of the new rdist region + * @count: number of redistributors the region is made of (0 in the old style + * single region, whose size is induced from the number of vcpus) + * + * Return 0 on success, < 0 otherwise + */ +static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, + gpa_t base, uint32_t count) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + struct list_head *rd_regions = &d->rd_regions; + size_t size = count * KVM_VGIC_V3_REDIST_SIZE; + int ret; + + /* single rdist region already set ?*/ + if (!count && !list_empty(rd_regions)) + return -EINVAL; + + /* cross the end of memory ? */ + if (base + size < base) + return -EINVAL; + + if (list_empty(rd_regions)) { + if (index != 0) + return -EINVAL; + } else { + rdreg = list_last_entry(rd_regions, + struct vgic_redist_region, list); + if (index != rdreg->index + 1) + return -EINVAL; + + /* Cannot add an explicitly sized regions after legacy region */ + if (!rdreg->count) + return -EINVAL; + } + + /* + * For legacy single-region redistributor regions (!count), + * check that the redistributor region does not overlap with the + * distributor's address space. + */ + if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && + vgic_dist_overlap(kvm, base, size)) + return -EINVAL; + + /* collision with any other rdist region? 
*/ + if (vgic_v3_rdist_overlap(kvm, base, size)) + return -EINVAL; + + rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL); + if (!rdreg) + return -ENOMEM; + + rdreg->base = VGIC_ADDR_UNDEF; + + ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K); + if (ret) + goto free; + + rdreg->base = base; + rdreg->count = count; + rdreg->free_index = 0; + rdreg->index = index; + + list_add_tail(&rdreg->list, rd_regions); + return 0; +free: + kfree(rdreg); + return ret; +} + +int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) +{ + int ret; + + ret = vgic_v3_insert_redist_region(kvm, index, addr, count); + if (ret) + return ret; + + /* + * Register iodevs for each existing VCPU. Adding more VCPUs + * afterwards will register the iodevs when needed. + */ + ret = vgic_register_all_redist_iodevs(kvm); + if (ret) + return ret; + + return 0; +} + +int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; + gpa_t addr; + int ret; + + ret = vgic_v3_parse_attr(dev, attr, &reg_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + iodev.regions = vgic_v3_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); + iodev.base_addr = 0; + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ + iodev.regions = vgic_v3_rd_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); + iodev.base_addr = 0; + break; + } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 reg, id; + + id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); + return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg); + } + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. */ + if (addr & 3) + return -ENXIO; + + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; + + return 0; +} +/* + * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI + * generation register ICC_SGI1R_EL1) with a given VCPU. + * If the VCPU's MPIDR matches, return the level0 affinity, otherwise + * return -1. + */ +static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) +{ + unsigned long affinity; + int level0; + + /* + * Split the current VCPU's MPIDR into affinity level 0 and the + * rest as this is what we have to compare against. + */ + affinity = kvm_vcpu_get_mpidr_aff(vcpu); + level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); + affinity &= ~MPIDR_LEVEL_MASK; + + /* bail out if the upper three levels don't match */ + if (sgi_aff != affinity) + return -1; + + /* Is this VCPU's bit set in the mask ? */ + if (!(sgi_cpu_mask & BIT(level0))) + return -1; + + return level0; +} + +/* + * The ICC_SGI* registers encode the affinity differently from the MPIDR, + * so provide a wrapper to use the existing defines to isolate a certain + * affinity level. + */ +#define SGI_AFFINITY_LEVEL(reg, level) \ + ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ + >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +/** + * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs + * @vcpu: The VCPU requesting a SGI + * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU + * @allow_group1: Does the sysreg access allow generation of G1 SGIs + * + * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. + * This will trap in sys_regs.c and call this function. 
+ * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the + * target processors as well as a bitmask of 16 Aff0 CPUs. + * If the interrupt routing mode bit is not set, we iterate over all VCPUs to + * check for matching ones. If this bit is set, we signal all, but not the + * calling VCPU. + */ +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *c_vcpu; + u16 target_cpus; + u64 mpidr; + int sgi, c; + int vcpu_id = vcpu->vcpu_id; + bool broadcast; + unsigned long flags; + + sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; + broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); + target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; + mpidr = SGI_AFFINITY_LEVEL(reg, 3); + mpidr |= SGI_AFFINITY_LEVEL(reg, 2); + mpidr |= SGI_AFFINITY_LEVEL(reg, 1); + + /* + * We iterate over all VCPUs to find the MPIDRs matching the request. + * If we have handled one CPU, we clear its bit to detect early + * if we are already finished. This avoids iterating through all + * VCPUs when most of the times we just signal a single VCPU. + */ + kvm_for_each_vcpu(c, c_vcpu, kvm) { + struct vgic_irq *irq; + + /* Exit early if we have dealt with all requested CPUs */ + if (!broadcast && target_cpus == 0) + break; + + /* Don't signal the calling VCPU */ + if (broadcast && c == vcpu_id) + continue; + + if (!broadcast) { + int level0; + + level0 = match_mpidr(mpidr, target_cpus, c_vcpu); + if (level0 == -1) + continue; + + /* remove this matching VCPU from the mask */ + target_cpus &= ~BIT(level0); + } + + irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* + * An access targetting Group0 SGIs can only generate + * those, while an access targetting Group1 SGIs can + * generate interrupts of either group. + */ + if (!irq->group || allow_group1) { + if (!irq->hw) { + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + /* HW SGI? 
Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v3_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers), + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device rd_dev = { + .regions = vgic_v3_rd_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers), + }; + + return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); +} + +int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u32 intid, u64 *val) +{ + if (intid % 32) + return -EINVAL; + + if (is_write) + vgic_write_irq_line_level_info(vcpu, intid, *val); + else + *val = vgic_read_irq_line_level_info(vcpu, intid); + + return 0; +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c new file mode 100644 index 000000000000..b2d73fc0d1ef --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -0,0 +1,1088 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VGIC MMIO handling functions + */ + +#include <linux/bitops.h> +#include <linux/bsearch.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/iodev.h> +#include <kvm/arm_arch_timer.h> +#include <kvm/arm_vgic.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return 0; +} + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return -1UL; +} + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ + return 0; +} + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->group) + value |= BIT(i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +static void vgic_update_vsgi(struct vgic_irq *irq) +{ + WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group)); +} + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->group = !!(val & BIT(i)); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + vgic_update_vsgi(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } else { + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } + + vgic_put_irq(vcpu->kvm, irq); + } +} + +/* + * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value + * of the enabled bit, so there is only one function for both here. 
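A small editorial aside on the set/clear pair the comment above describes (illustrative only, not part of the patch): ISENABLER and ICENABLER differ only on writes, where each 1 bit respectively sets or clears the enable state, while reads of either register return the same enable bitmap.

/* Illustrative only: the write-1-to-set / write-1-to-clear convention. */
#include <linux/types.h>

static inline u32 example_apply_senable(u32 enabled, u32 written)
{
	return enabled | written;	/* GICD_ISENABLER semantics */
}

static inline u32 example_apply_cenable(u32 enabled, u32 written)
{
	return enabled & ~written;	/* GICD_ICENABLER semantics */
}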
+ */ +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->enabled) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + if (!irq->enabled) { + struct irq_data *data; + + irq->enabled = true; + data = &irq_to_desc(irq->host_irq)->irq_data; + while (irqd_irq_disabled(data)) + enable_irq(irq->host_irq); + } + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } else if (vgic_irq_is_mapped_level(irq)) { + bool was_high = irq->line_level; + + /* + * We need to update the state of the interrupt because + * the guest might have changed the state of the device + * while the interrupt was disabled at the VGIC level. + */ + irq->line_level = vgic_get_phys_line_level(irq); + /* + * Deactivate the physical interrupt so the GIC will let + * us know when it is asserted again. + */ + if (!irq->active && was_high && !irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) + disable_irq_nosync(irq->host_irq); + + irq->enabled = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = false; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, 
intid + i); + unsigned long flags; + bool val; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + int err; + + val = false; + err = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &val); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + } else { + val = irq_is_pending(irq); + } + + value |= ((u32)val << i); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + return (vgic_irq_is_sgi(irq->intid) && + vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); +} + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* GICD_ISPENDR0 SGI bits are WI */ + if (is_vgic_v2_sgi(vcpu, irq)) { + vgic_put_irq(vcpu->kvm, irq); + continue; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* HW SGI? Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } + + irq->pending_latch = true; + if (irq->hw) + vgic_irq_set_phys_active(irq, true); + + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + + /* + * GICv2 SGIs are terribly broken. We can't restore + * the source of the interrupt, so just pick the vcpu + * itself as the source... + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source |= BIT(vcpu->vcpu_id); + + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ + irq->pending_latch = false; + + /* + * We don't want the guest to effectively mask the physical + * interrupt by doing a write to SPENDR followed by a write to + * CPENDR for HW interrupts, so we clear the active state on + * the physical side if the virtual interrupt is not active. + * This may lead to taking an additional interrupt on the + * host, but that should not be a problem as the worst that + * can happen is an additional vgic injection. We also clear + * the pending state to maintain proper semantics for edge HW + * interrupts. 
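The pending handlers in this hunk lean on irq_is_pending(), which lives in vgic.h rather than in this file; the following paraphrase is an editorial sketch, not the authoritative definition (see vgic.h), but it is what makes the edge/level distinction above work:

/* Paraphrase of the vgic.h helper; illustrative only. */
#include <kvm/arm_vgic.h>

static inline bool example_irq_is_pending(const struct vgic_irq *irq)
{
	/* Edge interrupts are pending only while the latch is set... */
	if (irq->config == VGIC_CONFIG_EDGE)
		return irq->pending_latch;

	/* ...level interrupts are also pending while the sampled line is high. */
	return irq->pending_latch || irq->line_level;
}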
+ */ + vgic_irq_set_phys_pending(irq, false); + if (!irq->active) + vgic_irq_set_phys_active(irq, false); +} + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* GICD_ICPENDR0 SGI bits are WI */ + if (is_vgic_v2_sgi(vcpu, irq)) { + vgic_put_irq(vcpu->kvm, irq); + continue; + } + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* HW SGI? Ask the GIC to clear its pending bit */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + false); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + continue; + } + + if (irq->hw) + vgic_hw_irq_cpending(vcpu, irq); + else + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* + * More fun with GICv2 SGIs! If we're clearing one of them + * from userspace, which source vcpu to clear? Let's not + * even think of it, and blow the whole set. + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source = 0; + + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +/* + * If we are fiddling with an IRQ's active state, we have to make sure the IRQ + * is not queued on some running VCPU's LRs, because then the change to the + * active state can be overwritten when the VCPU's state is synced coming back + * from the guest. + * + * For shared interrupts as well as GICv3 private interrupts, we have to + * stop all the VCPUs because interrupts can be migrated while we don't hold + * the IRQ locks and we don't want to be chasing moving targets. + * + * For GICv2 private interrupts we don't have to do anything because + * userspace accesses to the VGIC state already require all VCPUs to be + * stopped, and only the VCPU itself can modify its private interrupts + * active state, which guarantees that the VCPU is not running. + */ +static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) +{ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_halt_guest(vcpu->kvm); +} + +/* See vgic_access_active_prepare */ +static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) +{ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_resume_guest(vcpu->kvm); +} + +static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* + * Even for HW interrupts, don't evaluate the HW state as + * all the guest is interested in is the virtual state. 
+ */ + if (irq->active) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 val; + + mutex_lock(&vcpu->kvm->lock); + vgic_access_active_prepare(vcpu, intid); + + val = __vgic_mmio_read_active(vcpu, addr, len); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->lock); + + return val; +} + +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __vgic_mmio_read_active(vcpu, addr, len); +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active, bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->active = active; + vgic_irq_set_phys_active(irq, active); +} + +static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active) +{ + unsigned long flags; + struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw && !vgic_irq_is_sgi(irq->intid)) { + vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); + } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) { + /* + * GICv4.1 VSGI feature doesn't track an active state, + * so let's not kid ourselves, there is nothing we can + * do here. + */ + irq->active = false; + } else { + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u8 active_source; + + irq->active = active; + + /* + * The GICv2 architecture indicates that the source CPUID for + * an SGI should be provided during an EOI which implies that + * the active state is stored somewhere, but at the same time + * this state is not architecturally exposed anywhere and we + * have no way of knowing the right source. + * + * This may lead to a VCPU not being able to receive + * additional instances of a particular SGI after migration + * for a GICv2 VM on some GIC implementations. Oh well. + */ + active_source = (requester_vcpu) ? 
requester_vcpu->vcpu_id : 0; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + active && vgic_irq_is_sgi(irq->intid)) + irq->active_source = active_source; + } + + if (irq->active) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + else + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); +} + +static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, false); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + + mutex_lock(&vcpu->kvm->lock); + vgic_access_active_prepare(vcpu, intid); + + __vgic_mmio_write_cactive(vcpu, addr, len, val); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->lock); +} + +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __vgic_mmio_write_cactive(vcpu, addr, len, val); + return 0; +} + +static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, true); + vgic_put_irq(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + + mutex_lock(&vcpu->kvm->lock); + vgic_access_active_prepare(vcpu, intid); + + __vgic_mmio_write_sactive(vcpu, addr, len, val); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->lock); +} + +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + __vgic_mmio_write_sactive(vcpu, addr, len, val); + return 0; +} + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + val |= (u64)irq->priority << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +/* + * We currently don't handle changing the priority of an interrupt that + * is already pending on a VCPU. If there is a need for this, we would + * need to make this VCPU exit and re-evaluate the priorities, potentially + * leading to this interrupt getting presented now to the guest (if it has + * been masked by the priority mask before). 
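The write handler that follows narrows every guest-supplied priority byte with GENMASK(7, 8 - VGIC_PRI_BITS). A short worked example (editorial sketch, not part of the patch; it assumes VGIC_PRI_BITS is 5, as defined in vgic.h):

/* Illustrative only: effect of the narrowing in the handler below. */
#include <linux/bits.h>
#include <linux/types.h>

#define EXAMPLE_VGIC_PRI_BITS	5	/* assumed value of VGIC_PRI_BITS */

static inline u8 example_narrow_priority(u8 guest_val)
{
	/* GENMASK(7, 8 - 5) == GENMASK(7, 3) == 0xf8 */
	return guest_val & GENMASK(7, 8 - EXAMPLE_VGIC_PRI_BITS);
}

So a guest write of 0xab is stored (and later read back) as 0xa8; dropping the low-order bits this way is what the GIC architecture permits for implementations with fewer priority bits.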
+ */ +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + unsigned long flags; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* Narrow the priority range to what we actually support */ + irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); + if (irq->hw && vgic_irq_is_sgi(irq->intid)) + vgic_update_vsgi(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + u32 value = 0; + int i; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->config == VGIC_CONFIG_EDGE) + value |= (2U << (i * 2)); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + int i; + unsigned long flags; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq; + + /* + * The configuration cannot be changed for SGIs in general, + * for PPIs this is IMPLEMENTATION DEFINED. The arch timer + * code relies on PPIs being level triggered, so we also + * make them read-only here. + */ + if (intid + i < VGIC_NR_PRIVATE_IRQS) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (test_bit(i * 2 + 1, &val)) + irq->config = VGIC_CONFIG_EDGE; + else + irq->config = VGIC_CONFIG_LEVEL; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) +{ + int i; + u64 val = 0; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) + val |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u64 val) +{ + int i; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + unsigned long flags; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + bool new_level; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* + * Line level is set irrespective of irq type + * (level or edge) to avoid dependency that VM should + * restore irq config before line level. 
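For the two configuration handlers above, each interrupt owns a 2-bit field in the ICFGR registers and only bit 1 of that field is significant (set means edge-triggered). A compact editorial illustration (not part of the patch):

/* Illustrative only: ICFGR packing as the handlers above implement it. */
#include <linux/types.h>

static inline u32 example_icfgr_word(const bool edge[16])
{
	u32 word = 0;
	int i;

	for (i = 0; i < 16; i++)
		if (edge[i])
			word |= 2U << (i * 2);	/* bit 1 of each 2-bit field */

	return word;	/* sixteen edge-triggered interrupts read back as 0xaaaaaaaa */
}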
+ */ + new_level = !!(val & (1U << i)); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->line_level = new_level; + if (new_level) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + else + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + +static int match_region(const void *key, const void *elt) +{ + const unsigned int offset = (unsigned long)key; + const struct vgic_register_region *region = elt; + + if (offset < region->reg_offset) + return -1; + + if (offset >= region->reg_offset + region->len) + return 1; + + return 0; +} + +const struct vgic_register_region * +vgic_find_mmio_region(const struct vgic_register_region *regions, + int nr_regions, unsigned int offset) +{ + return bsearch((void *)(uintptr_t)offset, regions, nr_regions, + sizeof(regions[0]), match_region); +} + +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_vmcr(vcpu, vmcr); + else + vgic_v3_set_vmcr(vcpu, vmcr); +} + +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_get_vmcr(vcpu, vmcr); + else + vgic_v3_get_vmcr(vcpu, vmcr); +} + +/* + * kvm_mmio_read_buf() returns a value in a format where it can be converted + * to a byte array and be directly observed as the guest wanted it to appear + * in memory if it had done the store itself, which is LE for the GIC, as the + * guest knows the GIC is always LE. + * + * We convert this value to the CPUs native format to deal with it as a data + * value. + */ +unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) +{ + unsigned long data = kvm_mmio_read_buf(val, len); + + switch (len) { + case 1: + return data; + case 2: + return le16_to_cpu(data); + case 4: + return le32_to_cpu(data); + default: + return le64_to_cpu(data); + } +} + +/* + * kvm_mmio_write_buf() expects a value in a format such that if converted to + * a byte array it is observed as the guest would see it if it could perform + * the load directly. Since the GIC is LE, and the guest knows this, the + * guest expects a value in little endian format. + * + * We convert the data value from the CPUs native format to LE so that the + * value is returned in the proper format. + */ +void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, + unsigned long data) +{ + switch (len) { + case 1: + break; + case 2: + data = cpu_to_le16(data); + break; + case 4: + data = cpu_to_le32(data); + break; + default: + data = cpu_to_le64(data); + } + + kvm_mmio_write_buf(buf, len, data); +} + +static +struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) +{ + return container_of(dev, struct vgic_io_device, dev); +} + +static bool check_region(const struct kvm *kvm, + const struct vgic_register_region *region, + gpa_t addr, int len) +{ + int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + + switch (len) { + case sizeof(u8): + flags = VGIC_ACCESS_8bit; + break; + case sizeof(u32): + flags = VGIC_ACCESS_32bit; + break; + case sizeof(u64): + flags = VGIC_ACCESS_64bit; + break; + default: + return false; + } + + if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) { + if (!region->bits_per_irq) + return true; + + /* Do we access a non-allocated IRQ? 
*/ + return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; + } + + return false; +} + +const struct vgic_register_region * +vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, int len) +{ + const struct vgic_register_region *region; + + region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, + addr - iodev->base_addr); + if (!region || !check_region(vcpu->kvm, region, addr, len)) + return NULL; + + return region; +} + +static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, u32 *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) { + *val = 0; + return 0; + } + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_read) + *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); + else + *val = region->read(r_vcpu, addr, sizeof(u32)); + + return 0; +} + +static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, const u32 *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) + return 0; + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_write) + return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); + + region->write(r_vcpu, addr, sizeof(u32), *val); + return 0; +} + +/* + * Userland access to VGIC registers. + */ +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val) +{ + if (is_write) + return vgic_uaccess_write(vcpu, &dev->dev, offset, val); + else + return vgic_uaccess_read(vcpu, &dev->dev, offset, val); +} + +static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + unsigned long data = 0; + + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) { + memset(val, 0, len); + return 0; + } + + switch (iodev->iodev_type) { + case IODEV_CPUIF: + data = region->read(vcpu, addr, len); + break; + case IODEV_DIST: + data = region->read(vcpu, addr, len); + break; + case IODEV_REDIST: + data = region->read(iodev->redist_vcpu, addr, len); + break; + case IODEV_ITS: + data = region->its_read(vcpu->kvm, iodev->its, addr, len); + break; + } + + vgic_data_host_to_mmio_bus(val, len, data); + return 0; +} + +static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + unsigned long data = vgic_data_mmio_bus_to_host(val, len); + + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) + return 0; + + switch (iodev->iodev_type) { + case IODEV_CPUIF: + region->write(vcpu, addr, len, data); + break; + case IODEV_DIST: + region->write(vcpu, addr, len, data); + break; + case IODEV_REDIST: + region->write(iodev->redist_vcpu, addr, len, data); + break; + case IODEV_ITS: + region->its_write(vcpu->kvm, iodev->its, addr, len, data); + break; + } + + return 0; +} + +struct kvm_io_device_ops kvm_io_gic_ops = { + .read = dispatch_mmio_read, + .write = dispatch_mmio_write, +}; + +int 
vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, + enum vgic_type type) +{ + struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; + int ret = 0; + unsigned int len; + + switch (type) { + case VGIC_V2: + len = vgic_v2_init_dist_iodev(io_device); + break; + case VGIC_V3: + len = vgic_v3_init_dist_iodev(io_device); + break; + default: + BUG_ON(1); + } + + io_device->base_addr = dist_base_address; + io_device->iodev_type = IODEV_DIST; + io_device->redist_vcpu = NULL; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, + len, &io_device->dev); + mutex_unlock(&kvm->slots_lock); + + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h new file mode 100644 index 000000000000..fefcca2b14dc --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ +#ifndef __KVM_ARM_VGIC_MMIO_H__ +#define __KVM_ARM_VGIC_MMIO_H__ + +struct vgic_register_region { + unsigned int reg_offset; + unsigned int len; + unsigned int bits_per_irq; + unsigned int access_flags; + union { + unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len); + }; + union { + void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + void (*its_write)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val); + }; + unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + union { + int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val); + }; +}; + +extern struct kvm_io_device_ops kvm_io_gic_ops; + +#define VGIC_ACCESS_8bit 1 +#define VGIC_ACCESS_32bit 2 +#define VGIC_ACCESS_64bit 4 + +/* + * Generate a mask that covers the number of bytes required to address + * up to 1024 interrupts, each represented by <bits> bits. This assumes + * that <bits> is a power of two. + */ +#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1) + +/* + * (addr & mask) gives us the _byte_ offset for the INT ID. + * We multiply this by 8 the get the _bit_ offset, then divide this by + * the number of bits to learn the actual INT ID. + * But instead of a division (which requires a "long long div" implementation), + * we shift by the binary logarithm of <bits>. + * This assumes that <bits> is a power of two. + */ +#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \ + 8 >> ilog2(bits)) + +/* + * Some VGIC registers store per-IRQ information, with a different number + * of bits per IRQ. For those registers this macro is used. + * The _WITH_LENGTH version instantiates registers with a fixed length + * and is mutually exclusive with the _PER_IRQ version. 
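The offset arithmetic above becomes clearer with concrete numbers; the following evaluations are an editorial aid (not part of the patch) and simply apply the two macros defined above:

/* Illustrative only: hand evaluation of VGIC_ADDR_TO_INTID(). */
#include <linux/log2.h>

static inline unsigned int example_intid(unsigned int offset, unsigned int bits)
{
	return VGIC_ADDR_TO_INTID(offset, bits);
}

/*
 * example_intid(0x004, 1) == 32   (1 bit per IRQ,  e.g. GICD_ISENABLER)
 * example_intid(0x008, 2) == 32   (2 bits per IRQ, e.g. GICD_ICFGR)
 * example_intid(0x020, 8) == 32   (8 bits per IRQ, e.g. GICD_IPRIORITYR)
 *
 * i.e. all three byte offsets address the first SPI, INTID 32.
 */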
+ */ +#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = bpi, \ + .len = bpi * 1024 / 8, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = ur, \ + .uaccess_write = uw, \ + } + +#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + } + +#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = urd, \ + .uaccess_write = uwr, \ + } + +unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); + +void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, + unsigned long data); + +unsigned long extract_bytes(u64 data, unsigned int offset, + unsigned int num); + +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t 
addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val); + +u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u64 val); + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); + +unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); + +u64 vgic_sanitise_outer_cacheability(u64 reg); +u64 vgic_sanitise_inner_cacheability(u64 reg); +u64 vgic_sanitise_shareability(u64 reg); +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)); + +/* Find the proper register handler entry given a certain address offset */ +const struct vgic_register_region * +vgic_find_mmio_region(const struct vgic_register_region *regions, + int nr_regions, unsigned int offset); + +#endif diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c new file mode 100644 index 000000000000..ebf53a4e1296 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ + +#include <linux/irqchip/arm-gic.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_mmu.h> + +#include "vgic.h" + +static inline void vgic_v2_write_lr(int lr, u32 val) +{ + void __iomem *base = kvm_vgic_global_state.vctrl_base; + + writel_relaxed(val, base + GICH_LR0 + (lr * 4)); +} + +void vgic_v2_init_lrs(void) +{ + int i; + + for (i = 0; i < kvm_vgic_global_state.nr_lr; i++) + vgic_v2_write_lr(i, 0); +} + +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr |= GICH_HCR_UIE; +} + +static bool lr_signals_eoi_mi(u32 lr_val) +{ + return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) && + !(lr_val & GICH_LR_HW); +} + +/* + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs + */ +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + int lr; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + cpuif->vgic_hcr &= ~GICH_HCR_UIE; + + for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) { + u32 val = cpuif->vgic_lr[lr]; + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + + /* Extract the source vCPU id from the LR */ + cpuid = val & GICH_LR_PHYSID_CPUID; + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + cpuid &= 7; + + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + + raw_spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & GICH_LR_ACTIVE_BIT); + + if (irq->active && vgic_irq_is_sgi(intid)) + irq->active_source = cpuid; + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + 
(val & GICH_LR_PENDING_BIT)) { + irq->pending_latch = true; + + if (vgic_irq_is_sgi(intid)) + irq->source |= (1 << cpuid); + } + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) + irq->pending_latch = false; + + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. + */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); + } + + cpuif->used_lrs = 0; +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. + */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = irq->intid; + bool allow_pending = true; + + if (irq->active) { + val |= GICH_LR_ACTIVE_BIT; + if (vgic_irq_is_sgi(irq->intid)) + val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; + if (vgic_irq_is_multi_sgi(irq)) { + allow_pending = false; + val |= GICH_LR_EOI; + } + } + + if (irq->group) + val |= GICH_LR_GROUP1; + + if (irq->hw) { + val |= GICH_LR_HW; + val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active) + allow_pending = false; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) { + val |= GICH_LR_EOI; + + /* + * Software resampling doesn't work very well + * if we allow P+A, so let's not do that. + */ + if (irq->active) + allow_pending = false; + } + } + + if (allow_pending && irq_is_pending(irq)) { + val |= GICH_LR_PENDING_BIT; + + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending_latch = false; + + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + irq->source &= ~(1 << (src - 1)); + if (irq->source) { + irq->pending_latch = true; + val |= GICH_LR_EOI; + } + } + } + + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v2_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) + irq->line_level = false; + + /* The GICv2 LR only holds five bits of priority. 
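The statement that follows implements the five-bit compression noted above: only priority bits [7:3] fit into the list register. A tiny editorial sketch (not part of the patch):

/* Illustrative only: 8-bit virtual priority -> 5-bit GICH_LR field. */
#include <linux/irqchip/arm-gic.h>
#include <linux/types.h>

static inline u32 example_lr_priority_bits(u8 prio)
{
	/* e.g. prio 0xa0 becomes 0x14, placed in LR bits [27:23] */
	return (u32)(prio >> 3) << GICH_LR_PRIORITY_SHIFT;
}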
*/ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; +} + +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0; +} + +void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u32 vmcr; + + vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) & + GICH_VMCR_ENABLE_GRP0_MASK; + vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) & + GICH_VMCR_ENABLE_GRP1_MASK; + vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) & + GICH_VMCR_ACK_CTL_MASK; + vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) & + GICH_VMCR_FIQ_EN_MASK; + vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) & + GICH_VMCR_CBPR_MASK; + vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) & + GICH_VMCR_EOI_MODE_MASK; + vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & + GICH_VMCR_ALIAS_BINPOINT_MASK; + vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & + GICH_VMCR_BINPOINT_MASK; + vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << + GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; + + cpu_if->vgic_vmcr = vmcr; +} + +void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; + + vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >> + GICH_VMCR_ENABLE_GRP0_SHIFT; + vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >> + GICH_VMCR_ENABLE_GRP1_SHIFT; + vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >> + GICH_VMCR_ACK_CTL_SHIFT; + vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >> + GICH_VMCR_FIQ_EN_SHIFT; + vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >> + GICH_VMCR_CBPR_SHIFT; + vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >> + GICH_VMCR_EOI_MODE_SHIFT; + + vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> + GICH_VMCR_ALIAS_BINPOINT_SHIFT; + vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> + GICH_VMCR_BINPOINT_SHIFT; + vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> + GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; +} + +void vgic_v2_enable(struct kvm_vcpu *vcpu) +{ + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; + + /* Get the show on the road... */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; +} + +/* check for overlapping regions and for regions crossing the end of memory */ +static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) +{ + if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base) + return false; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base) + return false; + + if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base) + return true; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base) + return true; + + return false; +} + +int vgic_v2_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret = 0; + + if (vgic_ready(kvm)) + goto out; + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || + IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { + kvm_err("Need to set vgic cpu and dist addresses first\n"); + ret = -ENXIO; + goto out; + } + + if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { + kvm_err("VGIC CPU and dist frames overlap\n"); + ret = -EINVAL; + goto out; + } + + /* + * Initialize the vgic if this hasn't already been done on demand by + * accessing the vgic state from userspace. 
+ */ + ret = vgic_init(kvm); + if (ret) { + kvm_err("Unable to initialize VGIC dynamic data structures\n"); + goto out; + } + + ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); + if (ret) { + kvm_err("Unable to register VGIC MMIO regions\n"); + goto out; + } + + if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { + ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, + kvm_vgic_global_state.vcpu_base, + KVM_VGIC_V2_CPU_SIZE, true); + if (ret) { + kvm_err("Unable to remap VGIC CPU to VCPU\n"); + goto out; + } + } + + dist->ready = true; + +out: + return ret; +} + +DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); + +/** + * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller + * @info: pointer to the GIC description + * + * Returns 0 if the VGICv2 has been probed successfully, returns an error code + * otherwise + */ +int vgic_v2_probe(const struct gic_kvm_info *info) +{ + int ret; + u32 vtr; + + if (!info->vctrl.start) { + kvm_err("GICH not present in the firmware table\n"); + return -ENXIO; + } + + if (!PAGE_ALIGNED(info->vcpu.start) || + !PAGE_ALIGNED(resource_size(&info->vcpu))) { + kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); + + ret = create_hyp_io_mappings(info->vcpu.start, + resource_size(&info->vcpu), + &kvm_vgic_global_state.vcpu_base_va, + &kvm_vgic_global_state.vcpu_hyp_va); + if (ret) { + kvm_err("Cannot map GICV into hyp\n"); + goto out; + } + + static_branch_enable(&vgic_v2_cpuif_trap); + } + + ret = create_hyp_io_mappings(info->vctrl.start, + resource_size(&info->vctrl), + &kvm_vgic_global_state.vctrl_base, + &kvm_vgic_global_state.vctrl_hyp); + if (ret) { + kvm_err("Cannot map VCTRL into hyp\n"); + goto out; + } + + vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); + kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; + + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device\n"); + goto out; + } + + kvm_vgic_global_state.can_emulate_gicv2 = true; + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.type = VGIC_V2; + kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; + + kvm_debug("vgic-v2@%llx\n", info->vctrl.start); + + return 0; +out: + if (kvm_vgic_global_state.vctrl_base) + iounmap(kvm_vgic_global_state.vctrl_base); + if (kvm_vgic_global_state.vcpu_base_va) + iounmap(kvm_vgic_global_state.vcpu_base_va); + + return ret; +} + +static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u64 used_lrs = cpu_if->used_lrs; + u64 elrsr; + int i; + + elrsr = readl_relaxed(base + GICH_ELRSR0); + if (unlikely(used_lrs > 32)) + elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32; + + for (i = 0; i < used_lrs; i++) { + if (elrsr & (1UL << i)) + cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; + else + cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4)); + + writel_relaxed(0, base + GICH_LR0 + (i * 4)); + } +} + +void vgic_v2_save_state(struct kvm_vcpu *vcpu) +{ + void __iomem *base = kvm_vgic_global_state.vctrl_base; + u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; + + if (!base) + return; + + if (used_lrs) { + save_lrs(vcpu, base); + writel_relaxed(0, base + GICH_HCR); + } +} + +void vgic_v2_restore_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + void __iomem *base = kvm_vgic_global_state.vctrl_base; + u64 used_lrs = cpu_if->used_lrs; + int i; + + if (!base) + return; + + if 
(used_lrs) { + writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); + for (i = 0; i < used_lrs; i++) { + writel_relaxed(cpu_if->vgic_lr[i], + base + GICH_LR0 + (i * 4)); + } + } +} + +void vgic_v2_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + writel_relaxed(cpu_if->vgic_vmcr, + kvm_vgic_global_state.vctrl_base + GICH_VMCR); + writel_relaxed(cpu_if->vgic_apr, + kvm_vgic_global_state.vctrl_base + GICH_APR); +} + +void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + vgic_v2_vmcr_sync(vcpu); + cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); +} diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c new file mode 100644 index 000000000000..76e2d85789ed --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -0,0 +1,693 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_asm.h> + +#include "vgic.h" + +static bool group0_trap; +static bool group1_trap; +static bool common_trap; +static bool gicv4_enable; + +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + cpuif->vgic_hcr |= ICH_HCR_UIE; +} + +static bool lr_signals_eoi_mi(u64 lr_val) +{ + return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && + !(lr_val & ICH_LR_HW); +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + int lr; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + cpuif->vgic_hcr &= ~ICH_HCR_UIE; + + for (lr = 0; lr < cpuif->used_lrs; lr++) { + u64 val = cpuif->vgic_lr[lr]; + u32 intid, cpuid; + struct vgic_irq *irq; + bool is_v2_sgi = false; + + cpuid = val & GICH_LR_PHYSID_CPUID; + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { + intid = val & ICH_LR_VIRTUAL_ID_MASK; + } else { + intid = val & GICH_LR_VIRTUALID; + is_v2_sgi = vgic_irq_is_sgi(intid); + } + + /* Notify fds when the guest EOI'ed a level-triggered IRQ */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + continue; + + raw_spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + if (irq->active && is_v2_sgi) + irq->active_source = cpuid; + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & ICH_LR_PENDING_BIT)) { + irq->pending_latch = true; + + if (is_v2_sgi) + irq->source |= (1 << cpuid); + } + + /* + * Clear soft pending state when level irqs have been acked. + */ + if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) + irq->pending_latch = false; + + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. 
+ * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); + } + + cpuif->used_lrs = 0; +} + +/* Requires the irq to be locked already */ +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = irq->intid; + bool allow_pending = true, is_v2_sgi; + + is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2); + + if (irq->active) { + val |= ICH_LR_ACTIVE_BIT; + if (is_v2_sgi) + val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; + if (vgic_irq_is_multi_sgi(irq)) { + allow_pending = false; + val |= ICH_LR_EOI; + } + } + + if (irq->hw) { + val |= ICH_LR_HW; + val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active) + allow_pending = false; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) { + val |= ICH_LR_EOI; + + /* + * Software resampling doesn't work very well + * if we allow P+A, so let's not do that. + */ + if (irq->active) + allow_pending = false; + } + } + + if (allow_pending && irq_is_pending(irq)) { + val |= ICH_LR_PENDING_BIT; + + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending_latch = false; + + if (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + irq->source &= ~(1 << (src - 1)); + if (irq->source) { + irq->pending_latch = true; + val |= ICH_LR_EOI; + } + } + } + + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v3_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) + irq->line_level = false; + + if (irq->group) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; +} + +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; +} + +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u32 vmcr; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { + vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & + ICH_VMCR_ACK_CTL_MASK; + vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & + ICH_VMCR_FIQ_EN_MASK; + } else { + /* + * When emulating GICv3 on GICv3 with SRE=1 on the + * VFIQEn bit is RES1 and the VAckCtl bit is RES0. 
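On a GICv2 guest, an SGI can be pending from several source CPUs at once but each list register encodes only one source, so vgic_v3_populate_lr() above drains one bit of irq->source per LR and re-latches the pending state while sources remain. A standalone illustration of that draining loop, using ffs() exactly as the patch does:

#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	/* SGI 3 pending from source CPUs 1 and 2 (bitmap as in irq->source). */
	uint8_t source = (1U << 1) | (1U << 2);
	int lr = 0;

	while (source) {
		int src = ffs(source);		/* lowest pending source, 1-based */

		source &= ~(1U << (src - 1));	/* consume it, as populate_lr does */
		printf("LR%d: SGI 3 from CPU %d%s\n", lr++, src - 1,
		       source ? " (EOI forced, more sources still pending)" : "");
	}
	return 0;
}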
+ */ + vmcr = ICH_VMCR_FIQ_EN_MASK; + } + + vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; + vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; + vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; + vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; + vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; + vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; + vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; + + cpu_if->vgic_vmcr = vmcr; +} + +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { + vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> + ICH_VMCR_ACK_CTL_SHIFT; + vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> + ICH_VMCR_FIQ_EN_SHIFT; + } else { + /* + * When emulating GICv3 on GICv3 with SRE=1 on the + * VFIQEn bit is RES1 and the VAckCtl bit is RES0. + */ + vmcrp->fiqen = 1; + vmcrp->ackctl = 0; + } + + vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; + vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; + vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; + vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; +} + +#define INITIAL_PENDBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) + +void vgic_v3_enable(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vgic_v3->vgic_vmcr = 0; + + /* + * If we are emulating a GICv3, we do it in an non-GICv2-compatible + * way, so we force SRE to 1 to demonstrate this to the guest. + * Also, we don't support any form of IRQ/FIQ bypass. + * This goes with the spec allowing the value to be RAO/WI. + */ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); + vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; + } else { + vgic_v3->vgic_sre = 0; + } + + vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_ID_BITS_MASK) >> + ICH_VTR_ID_BITS_SHIFT; + vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_PRI_BITS_MASK) >> + ICH_VTR_PRI_BITS_SHIFT) + 1; + + /* Get the show on the road... 
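vgic_v3_set_vmcr() and vgic_v3_get_vmcr() above shuttle each software-visible field into and out of a single 32-bit shadow of ICH_VMCR with matching shift/mask pairs. The round-trip sketch below shows the same pattern; the DEMO_* field positions are placeholders standing in for the real ICH_VMCR_* definitions from the GIC header.

#include <assert.h>
#include <stdint.h>

/* Placeholder field positions, standing in for the ICH_VMCR_* macros. */
#define DEMO_PMR_SHIFT	24
#define DEMO_PMR_MASK	(0xffU << DEMO_PMR_SHIFT)
#define DEMO_BPR0_SHIFT	21
#define DEMO_BPR0_MASK	(0x7U << DEMO_BPR0_SHIFT)
#define DEMO_ENG1_SHIFT	1
#define DEMO_ENG1_MASK	(0x1U << DEMO_ENG1_SHIFT)

struct demo_vmcr {
	uint32_t pmr, bpr, grpen1;
};

static uint32_t demo_pack(const struct demo_vmcr *v)
{
	uint32_t vmcr = 0;

	vmcr |= (v->pmr << DEMO_PMR_SHIFT) & DEMO_PMR_MASK;
	vmcr |= (v->bpr << DEMO_BPR0_SHIFT) & DEMO_BPR0_MASK;
	vmcr |= (v->grpen1 << DEMO_ENG1_SHIFT) & DEMO_ENG1_MASK;
	return vmcr;
}

static void demo_unpack(uint32_t vmcr, struct demo_vmcr *v)
{
	v->pmr = (vmcr & DEMO_PMR_MASK) >> DEMO_PMR_SHIFT;
	v->bpr = (vmcr & DEMO_BPR0_MASK) >> DEMO_BPR0_SHIFT;
	v->grpen1 = (vmcr & DEMO_ENG1_MASK) >> DEMO_ENG1_SHIFT;
}

int main(void)
{
	struct demo_vmcr in = { .pmr = 0xf0, .bpr = 2, .grpen1 = 1 }, out;

	demo_unpack(demo_pack(&in), &out);
	assert(in.pmr == out.pmr && in.bpr == out.bpr && in.grpen1 == out.grpen1);
	return 0;
}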
*/ + vgic_v3->vgic_hcr = ICH_HCR_EN; + if (group0_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TALL0; + if (group1_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TALL1; + if (common_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TC; +} + +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) +{ + struct kvm_vcpu *vcpu; + int byte_offset, bit_nr; + gpa_t pendbase, ptr; + bool status; + u8 val; + int ret; + unsigned long flags; + +retry: + vcpu = irq->target_vcpu; + if (!vcpu) + return 0; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + ret = kvm_read_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + + status = val & (1 << bit_nr); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->target_vcpu != vcpu) { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + goto retry; + } + irq->pending_latch = status; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + if (status) { + /* clear consumed data */ + val &= ~(1 << bit_nr); + ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + } + return 0; +} + +/** + * vgic_v3_save_pending_tables - Save the pending tables into guest RAM + * kvm lock and all vcpu lock must be held + */ +int vgic_v3_save_pending_tables(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + gpa_t last_ptr = ~(gpa_t)0; + int ret; + u8 val; + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + int byte_offset, bit_nr; + struct kvm_vcpu *vcpu; + gpa_t pendbase, ptr; + bool stored; + + vcpu = irq->target_vcpu; + if (!vcpu) + continue; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + if (ptr != last_ptr) { + ret = kvm_read_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + last_ptr = ptr; + } + + stored = val & (1U << bit_nr); + if (stored == irq->pending_latch) + continue; + + if (irq->pending_latch) + val |= 1 << bit_nr; + else + val &= ~(1 << bit_nr); + + ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + if (ret) + return ret; + } + return 0; +} + +/** + * vgic_v3_rdist_overlap - check if a region overlaps with any + * existing redistributor region + * + * @kvm: kvm handle + * @base: base of the region + * @size: size of region + * + * Return: true if there is an overlap + */ +bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, &d->rd_regions, list) { + if ((base + size > rdreg->base) && + (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) + return true; + } + return false; +} + +/* + * Check for overlapping regions and for regions crossing the end of memory + * for base addresses which have already been set. 
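Stepping back to the pending tables handled earlier above: vgic_v3_lpi_sync_pending_status() and vgic_v3_save_pending_tables() both locate an LPI's pending bit with the same byte/bit arithmetic relative to GICR_PENDBASER. A standalone worked example of that addressing (BITS_PER_BYTE is simply 8; the base address is made up):

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE	8

int main(void)
{
	uint64_t pendbase = 0x80000000ULL;	/* example GICR_PENDBASER address */
	uint32_t intid = 8197;			/* LPIs start at INTID 8192 */

	uint32_t byte_offset = intid / BITS_PER_BYTE;	/* 1024 */
	uint32_t bit_nr = intid % BITS_PER_BYTE;	/* 5 */
	uint64_t ptr = pendbase + byte_offset;

	printf("INTID %u -> guest byte 0x%llx, bit %u (mask 0x%02x)\n",
	       intid, (unsigned long long)ptr, bit_nr, 1U << bit_nr);
	return 0;
}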
+ */ +bool vgic_v3_check_base(struct kvm *kvm) +{ + struct vgic_dist *d = &kvm->arch.vgic; + struct vgic_redist_region *rdreg; + + if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && + d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) + return false; + + list_for_each_entry(rdreg, &d->rd_regions, list) { + if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) < + rdreg->base) + return false; + } + + if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) + return true; + + return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, + KVM_VGIC_V3_DIST_SIZE); +} + +/** + * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one + * which has free space to put a new rdist region. + * + * @rd_regions: redistributor region list head + * + * A redistributor regions maps n redistributors, n = region size / (2 x 64kB). + * Stride between redistributors is 0 and regions are filled in the index order. + * + * Return: the redist region handle, if any, that has space to map a new rdist + * region. + */ +struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) +{ + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, rd_regions, list) { + if (!vgic_v3_redist_region_full(rdreg)) + return rdreg; + } + return NULL; +} + +struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, + u32 index) +{ + struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; + struct vgic_redist_region *rdreg; + + list_for_each_entry(rdreg, rd_regions, list) { + if (rdreg->index == index) + return rdreg; + } + return NULL; +} + + +int vgic_v3_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int ret = 0; + int c; + + if (vgic_ready(kvm)) + goto out; + + kvm_for_each_vcpu(c, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { + kvm_debug("vcpu %d redistributor base not set\n", c); + ret = -ENXIO; + goto out; + } + } + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { + kvm_err("Need to set vgic distributor addresses first\n"); + ret = -ENXIO; + goto out; + } + + if (!vgic_v3_check_base(kvm)) { + kvm_err("VGIC redist and dist frames overlap\n"); + ret = -EINVAL; + goto out; + } + + /* + * For a VGICv3 we require the userland to explicitly initialize + * the VGIC before we need to use it. 
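The region checks above reduce to two interval predicates: vgic_v3_rdist_overlap() reports an overlap iff each region starts below the other's end, and vgic_v3_check_base() rejects a region whose end arithmetically precedes its base (a wrap past the end of the address space). A standalone restatement with a couple of cases:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same predicate as vgic_v3_rdist_overlap(): half-open [base, base + size). */
static bool regions_overlap(uint64_t b1, uint64_t s1, uint64_t b2, uint64_t s2)
{
	return (b1 + s1 > b2) && (b1 < b2 + s2);
}

/* Same wrap check as vgic_v3_check_base(): the end must not precede the base. */
static bool region_wraps(uint64_t base, uint64_t size)
{
	return base + size < base;
}

int main(void)
{
	/* 64K distributor at 0x8000000 vs a 128K redistributor at 0x8010000. */
	printf("adjacent: %d\n", regions_overlap(0x8000000, 0x10000,
						 0x8010000, 0x20000));	/* 0 */
	printf("clash:    %d\n", regions_overlap(0x8000000, 0x20000,
						 0x8010000, 0x20000));	/* 1 */
	printf("wraps:    %d\n", region_wraps(~0ULL - 0x1000, 0x10000));/* 1 */
	return 0;
}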
+ */ + if (!vgic_initialized(kvm)) { + ret = -EBUSY; + goto out; + } + + ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); + if (ret) { + kvm_err("Unable to register VGICv3 dist MMIO regions\n"); + goto out; + } + + if (kvm_vgic_global_state.has_gicv4_1) + vgic_v4_configure_vsgis(kvm); + dist->ready = true; + +out: + return ret; +} + +DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); + +static int __init early_group0_trap_cfg(char *buf) +{ + return strtobool(buf, &group0_trap); +} +early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); + +static int __init early_group1_trap_cfg(char *buf) +{ + return strtobool(buf, &group1_trap); +} +early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); + +static int __init early_common_trap_cfg(char *buf) +{ + return strtobool(buf, &common_trap); +} +early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); + +static int __init early_gicv4_enable(char *buf) +{ + return strtobool(buf, &gicv4_enable); +} +early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); + +/** + * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller + * @info: pointer to the GIC description + * + * Returns 0 if the VGICv3 has been probed successfully, returns an error code + * otherwise + */ +int vgic_v3_probe(const struct gic_kvm_info *info) +{ + u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); + int ret; + + /* + * The ListRegs field is 5 bits, but there is an architectural + * maximum of 16 list registers. Just ignore bit 4... + */ + kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; + kvm_vgic_global_state.can_emulate_gicv2 = false; + kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; + + /* GICv4 support? */ + if (info->has_v4) { + kvm_vgic_global_state.has_gicv4 = gicv4_enable; + kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; + kvm_info("GICv4%s support %sabled\n", + kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", + gicv4_enable ? "en" : "dis"); + } + + if (!info->vcpu.start) { + kvm_info("GICv3: no GICV resource entry\n"); + kvm_vgic_global_state.vcpu_base = 0; + } else if (!PAGE_ALIGNED(info->vcpu.start)) { + pr_warn("GICV physical address 0x%llx not page aligned\n", + (unsigned long long)info->vcpu.start); + kvm_vgic_global_state.vcpu_base = 0; + } else { + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.can_emulate_gicv2 = true; + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device.\n"); + return ret; + } + kvm_info("vgic-v2@%llx\n", info->vcpu.start); + } + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) { + kvm_err("Cannot register GICv3 KVM device.\n"); + kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); + return ret; + } + + if (kvm_vgic_global_state.vcpu_base == 0) + kvm_info("disabling GICv2 emulation\n"); + + if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) { + group0_trap = true; + group1_trap = true; + } + + if (group0_trap || group1_trap || common_trap) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", + group0_trap ? "G0" : "", + group1_trap ? "G1" : "", + common_trap ? 
"C" : ""); + static_branch_enable(&vgic_v3_cpuif_trap); + } + + kvm_vgic_global_state.vctrl_base = NULL; + kvm_vgic_global_state.type = VGIC_V3; + kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; + + return 0; +} + +void vgic_v3_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (likely(cpu_if->vgic_sre)) + kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); + + kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if)); + + if (has_vhe()) + __vgic_v3_activate_traps(cpu_if); + + WARN_ON(vgic_v4_load(vcpu)); +} + +void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + if (likely(cpu_if->vgic_sre)) + cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + WARN_ON(vgic_v4_put(vcpu, false)); + + vgic_v3_vmcr_sync(vcpu); + + kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if)); + + if (has_vhe()) + __vgic_v3_deactivate_traps(cpu_if); +} diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c new file mode 100644 index 000000000000..27ac833e5ec7 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 ARM Ltd. + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/kvm_host.h> +#include <linux/irqchip/arm-gic-v3.h> + +#include "vgic.h" + +/* + * How KVM uses GICv4 (insert rude comments here): + * + * The vgic-v4 layer acts as a bridge between several entities: + * - The GICv4 ITS representation offered by the ITS driver + * - VFIO, which is in charge of the PCI endpoint + * - The virtual ITS, which is the only thing the guest sees + * + * The configuration of VLPIs is triggered by a callback from VFIO, + * instructing KVM that a PCI device has been configured to deliver + * MSIs to a vITS. + * + * kvm_vgic_v4_set_forwarding() is thus called with the routing entry, + * and this is used to find the corresponding vITS data structures + * (ITS instance, device, event and irq) using a process that is + * extremely similar to the injection of an MSI. + * + * At this stage, we can link the guest's view of an LPI (uniquely + * identified by the routing entry) and the host irq, using the GICv4 + * driver mapping operation. Should the mapping succeed, we've then + * successfully upgraded the guest's LPI to a VLPI. We can then start + * with updating GICv4's view of the property table and generating an + * INValidation in order to kickstart the delivery of this VLPI to the + * guest directly, without software intervention. Well, almost. + * + * When the PCI endpoint is deconfigured, this operation is reversed + * with VFIO calling kvm_vgic_v4_unset_forwarding(). + * + * Once the VLPI has been mapped, it needs to follow any change the + * guest performs on its LPI through the vITS. For that, a number of + * command handlers have hooks to communicate these changes to the HW: + * - Any invalidation triggers a call to its_prop_update_vlpi() + * - The INT command results in a irq_set_irqchip_state(), which + * generates an INT on the corresponding VLPI. 
+ * - The CLEAR command results in a irq_set_irqchip_state(), which + * generates an CLEAR on the corresponding VLPI. + * - DISCARD translates into an unmap, similar to a call to + * kvm_vgic_v4_unset_forwarding(). + * - MOVI is translated by an update of the existing mapping, changing + * the target vcpu, resulting in a VMOVI being generated. + * - MOVALL is translated by a string of mapping updates (similar to + * the handling of MOVI). MOVALL is horrible. + * + * Note that a DISCARD/MAPTI sequence emitted from the guest without + * reprogramming the PCI endpoint after MAPTI does not result in a + * VLPI being mapped, as there is no callback from VFIO (the guest + * will get the interrupt via the normal SW injection). Fixing this is + * not trivial, and requires some horrible messing with the VFIO + * internals. Not fun. Don't do that. + * + * Then there is the scheduling. Each time a vcpu is about to run on a + * physical CPU, KVM must tell the corresponding redistributor about + * it. And if we've migrated our vcpu from one CPU to another, we must + * tell the ITS (so that the messages reach the right redistributor). + * This is done in two steps: first issue a irq_set_affinity() on the + * irq corresponding to the vcpu, then call its_make_vpe_resident(). + * You must be in a non-preemptible context. On exit, a call to + * its_make_vpe_non_resident() tells the redistributor that we're done + * with the vcpu. + * + * Finally, the doorbell handling: Each vcpu is allocated an interrupt + * which will fire each time a VLPI is made pending whilst the vcpu is + * not running. Each time the vcpu gets blocked, the doorbell + * interrupt gets enabled. When the vcpu is unblocked (for whatever + * reason), the doorbell interrupt is disabled. + */ + +#define DB_IRQ_FLAGS (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING) + +static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) +{ + struct kvm_vcpu *vcpu = info; + + /* We got the message, no need to fire again */ + if (!kvm_vgic_global_state.has_gicv4_1 && + !irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) + disable_irq_nosync(irq); + + vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return IRQ_HANDLED; +} + +static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq) +{ + vpe->sgi_config[irq->intid].enabled = irq->enabled; + vpe->sgi_config[irq->intid].group = irq->group; + vpe->sgi_config[irq->intid].priority = irq->priority; +} + +static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int i; + + /* + * With GICv4.1, every virtual SGI can be directly injected. So + * let's pretend that they are HW interrupts, tied to a host + * IRQ. The SGI code will do its magic. 
+ */ + for (i = 0; i < VGIC_NR_SGIS; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct irq_desc *desc; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (irq->hw) + goto unlock; + + irq->hw = true; + irq->host_irq = irq_find_mapping(vpe->sgi_domain, i); + + /* Transfer the full irq state to the vPE */ + vgic_v4_sync_sgi_config(vpe, irq); + desc = irq_to_desc(irq->host_irq); + ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc), + false); + if (!WARN_ON(ret)) { + /* Transfer pending state */ + ret = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_ON(ret); + irq->pending_latch = false; + } + unlock: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < VGIC_NR_SGIS; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i); + struct irq_desc *desc; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!irq->hw) + goto unlock; + + irq->hw = false; + ret = irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &irq->pending_latch); + WARN_ON(ret); + + desc = irq_to_desc(irq->host_irq); + irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); + unlock: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + } +} + +/* Must be called with the kvm lock held */ +void vgic_v4_configure_vsgis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i; + + kvm_arm_halt_guest(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (dist->nassgireq) + vgic_v4_enable_vsgis(vcpu); + else + vgic_v4_disable_vsgis(vcpu); + } + + kvm_arm_resume_guest(kvm); +} + +/** + * vgic_v4_init - Initialize the GICv4 data structures + * @kvm: Pointer to the VM being initialized + * + * We may be called each time a vITS is created, or when the + * vgic is initialized. This relies on kvm->lock to be + * held. In both cases, the number of vcpus should now be + * fixed. + */ +int vgic_v4_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i, nr_vcpus, ret; + + if (!kvm_vgic_global_state.has_gicv4) + return 0; /* Nothing to see here... move along. */ + + if (dist->its_vm.vpes) + return 0; + + nr_vcpus = atomic_read(&kvm->online_vcpus); + + dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), + GFP_KERNEL); + if (!dist->its_vm.vpes) + return -ENOMEM; + + dist->its_vm.nr_vpes = nr_vcpus; + + kvm_for_each_vcpu(i, vcpu, kvm) + dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + ret = its_alloc_vcpu_irqs(&dist->its_vm); + if (ret < 0) { + kvm_err("VPE IRQ allocation failure\n"); + kfree(dist->its_vm.vpes); + dist->its_vm.nr_vpes = 0; + dist->its_vm.vpes = NULL; + return ret; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + int irq = dist->its_vm.vpes[i]->irq; + unsigned long irq_flags = DB_IRQ_FLAGS; + + /* + * Don't automatically enable the doorbell, as we're + * flipping it back and forth when the vcpu gets + * blocked. Also disable the lazy disabling, as the + * doorbell could kick us out of the guest too + * early... + * + * On GICv4.1, the doorbell is managed in HW and must + * be left enabled. 
+ */ + if (kvm_vgic_global_state.has_gicv4_1) + irq_flags &= ~IRQ_NOAUTOEN; + irq_set_status_flags(irq, irq_flags); + + ret = request_irq(irq, vgic_v4_doorbell_handler, + 0, "vcpu", vcpu); + if (ret) { + kvm_err("failed to allocate vcpu IRQ%d\n", irq); + /* + * Trick: adjust the number of vpes so we know + * how many to nuke on teardown... + */ + dist->its_vm.nr_vpes = i; + break; + } + } + + if (ret) + vgic_v4_teardown(kvm); + + return ret; +} + +/** + * vgic_v4_teardown - Free the GICv4 data structures + * @kvm: Pointer to the VM being destroyed + * + * Relies on kvm->lock to be held. + */ +void vgic_v4_teardown(struct kvm *kvm) +{ + struct its_vm *its_vm = &kvm->arch.vgic.its_vm; + int i; + + if (!its_vm->vpes) + return; + + for (i = 0; i < its_vm->nr_vpes; i++) { + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i); + int irq = its_vm->vpes[i]->irq; + + irq_clear_status_flags(irq, DB_IRQ_FLAGS); + free_irq(irq, vcpu); + } + + its_free_vcpu_irqs(its_vm); + kfree(its_vm->vpes); + its_vm->nr_vpes = 0; + its_vm->vpes = NULL; +} + +int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) + return 0; + + return its_make_vpe_non_resident(vpe, need_db); +} + +int vgic_v4_load(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int err; + + if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident) + return 0; + + /* + * Before making the VPE resident, make sure the redistributor + * corresponding to our current CPU expects us here. See the + * doc in drivers/irqchip/irq-gic-v4.c to understand how this + * turns into a VMOVP command at the ITS level. + */ + err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id())); + if (err) + return err; + + err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled); + if (err) + return err; + + /* + * Now that the VPE is resident, let's get rid of a potential + * doorbell interrupt that would still be pending. This is a + * GICv4.0 only "feature"... + */ + if (!kvm_vgic_global_state.has_gicv4_1) + err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false); + + return err; +} + +static struct vgic_its *vgic_get_its(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct kvm_msi msi = (struct kvm_msi) { + .address_lo = irq_entry->msi.address_lo, + .address_hi = irq_entry->msi.address_hi, + .data = irq_entry->msi.data, + .flags = irq_entry->msi.flags, + .devid = irq_entry->msi.devid, + }; + + return vgic_msi_to_its(kvm, &msi); +} + +int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct vgic_its *its; + struct vgic_irq *irq; + struct its_vlpi_map map; + int ret; + + if (!vgic_supports_direct_msis(kvm)) + return 0; + + /* + * Get the ITS, and escape early on error (not a valid + * doorbell for any of our vITSs). + */ + its = vgic_get_its(kvm, irq_entry); + if (IS_ERR(its)) + return 0; + + mutex_lock(&its->its_lock); + + /* Perform the actual DevID/EventID -> LPI translation. */ + ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq); + if (ret) + goto out; + + /* + * Emit the mapping request. If it fails, the ITS probably + * isn't v4 compatible, so let's silently bail out. Holding + * the ITS lock should ensure that nothing can modify the + * target vcpu. 
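The mapping request built just below in kvm_vgic_v4_set_forwarding() condenses the LPI configuration into a single property byte: priority in the top six bits, group and enable in the bottom two. A standalone sketch of that encoding; the DEMO_LPI_PROP_* values follow the GICv3 LPI property table layout and are defined locally only for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* LPI property byte layout: bits 7:2 priority, bit 1 group 1, bit 0 enable. */
#define DEMO_LPI_PROP_ENABLED	(1U << 0)
#define DEMO_LPI_PROP_GROUP1	(1U << 1)

static uint8_t lpi_properties(uint8_t priority, bool enabled)
{
	return (priority & 0xfc) |
	       (enabled ? DEMO_LPI_PROP_ENABLED : 0) |
	       DEMO_LPI_PROP_GROUP1;
}

int main(void)
{
	/* Priority 0xa0, enabled: 0xa0 | 0x02 | 0x01 = 0xa3. */
	printf("properties = 0x%02x\n", lpi_properties(0xa0, true));
	return 0;
}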
+ */ + map = (struct its_vlpi_map) { + .vm = &kvm->arch.vgic.its_vm, + .vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe, + .vintid = irq->intid, + .properties = ((irq->priority & 0xfc) | + (irq->enabled ? LPI_PROP_ENABLED : 0) | + LPI_PROP_GROUP1), + .db_enabled = true, + }; + + ret = its_map_vlpi(virq, &map); + if (ret) + goto out; + + irq->hw = true; + irq->host_irq = virq; + atomic_inc(&map.vpe->vlpi_count); + +out: + mutex_unlock(&its->its_lock); + return ret; +} + +int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct vgic_its *its; + struct vgic_irq *irq; + int ret; + + if (!vgic_supports_direct_msis(kvm)) + return 0; + + /* + * Get the ITS, and escape early on error (not a valid + * doorbell for any of our vITSs). + */ + its = vgic_get_its(kvm, irq_entry); + if (IS_ERR(its)) + return 0; + + mutex_lock(&its->its_lock); + + ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq); + if (ret) + goto out; + + WARN_ON(!(irq->hw && irq->host_irq == virq)); + if (irq->hw) { + atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); + irq->hw = false; + ret = its_unmap_vlpi(virq); + } + +out: + mutex_unlock(&its->its_lock); + return ret; +} diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c new file mode 100644 index 000000000000..c3643b7f101b --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic.c @@ -0,0 +1,1020 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015, 2016 ARM Ltd. + */ + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/list_sort.h> +#include <linux/nospec.h> + +#include <asm/kvm_hyp.h> + +#include "vgic.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +struct vgic_global kvm_vgic_global_state __ro_after_init = { + .gicv3_cpuif = STATIC_KEY_FALSE_INIT, +}; + +/* + * Locking order is always: + * kvm->lock (mutex) + * its->cmd_lock (mutex) + * its->its_lock (mutex) + * vgic_cpu->ap_list_lock must be taken with IRQs disabled + * kvm->lpi_list_lock must be taken with IRQs disabled + * vgic_irq->irq_lock must be taken with IRQs disabled + * + * As the ap_list_lock might be taken from the timer interrupt handler, + * we have to disable IRQs before taking this lock and everything lower + * than it. + * + * If you need to take multiple locks, always take the upper lock first, + * then the lower ones, e.g. first take the its_lock, then the irq_lock. + * If you are already holding a lock and need to take a higher one, you + * have to drop the lower ranking lock first and re-aquire it after having + * taken the upper one. + * + * When taking more than one ap_list_lock at the same time, always take the + * lowest numbered VCPU's ap_list_lock first, so: + * vcpuX->vcpu_id < vcpuY->vcpu_id: + * raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); + * raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); + * + * Since the VGIC must support injecting virtual interrupts from ISRs, we have + * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer + * spinlocks for any lock that may be taken while injecting an interrupt. + */ + +/* + * Iterate over the VM's list of mapped LPIs to find the one with a + * matching interrupt ID and return a reference to the IRQ structure. 
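The locking-order comment above boils down to "take the higher-ranked lock first, and when two ap_list_locks are needed, order them by vcpu_id". A condensed kernel-style sketch of the two-vCPU case, mirroring what vgic_prune_ap_list() does further down; the helper name lock_two_ap_lists() is illustrative and not part of the patch.

/*
 * Illustrative helper (not in the patch): lock two vCPUs' ap_lists in the
 * documented order, i.e. lowest vcpu_id first, nesting the second lock.
 */
static void lock_two_ap_lists(struct kvm_vcpu *x, struct kvm_vcpu *y)
{
	struct kvm_vcpu *first = (x->vcpu_id < y->vcpu_id) ? x : y;
	struct kvm_vcpu *second = (x->vcpu_id < y->vcpu_id) ? y : x;

	/* IRQs must already be disabled at this level of the hierarchy. */
	raw_spin_lock(&first->arch.vgic_cpu.ap_list_lock);
	raw_spin_lock_nested(&second->arch.vgic_cpu.ap_list_lock,
			     SINGLE_DEPTH_NESTING);
}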
+ */ +static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + if (irq->intid != intid) + continue; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() later once it's finished with the IRQ. + */ + vgic_get_irq_kref(irq); + goto out_unlock; + } + irq = NULL; + +out_unlock: + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + + return irq; +} + +/* + * This looks up the virtual interrupt ID to get the corresponding + * struct vgic_irq. It also increases the refcount, so any caller is expected + * to call vgic_put_irq() once it's finished with this IRQ. + */ +struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + u32 intid) +{ + /* SGIs and PPIs */ + if (intid <= VGIC_MAX_PRIVATE) { + intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); + return &vcpu->arch.vgic_cpu.private_irqs[intid]; + } + + /* SPIs */ + if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); + return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; + } + + /* LPIs */ + if (intid >= VGIC_MIN_LPI) + return vgic_get_lpi(kvm, intid); + + WARN(1, "Looking up struct vgic_irq for reserved INTID"); + return NULL; +} + +/* + * We can't do anything in here, because we lack the kvm pointer to + * lock and remove the item from the lpi_list. So we keep this function + * empty and use the return value of kref_put() to trigger the freeing. + */ +static void vgic_irq_release(struct kref *ref) +{ +} + +/* + * Drop the refcount on the LPI. Must be called with lpi_list_lock held. 
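vgic_get_irq() above dispatches purely on the INTID ranges fixed by the GIC architecture: 0-31 are private (SGIs then PPIs), 32 up to 32 + nr_spis are shared SPIs, and 8192 and above are LPIs looked up in the per-VM list. A standalone classifier over the same ranges; the local constants mirror VGIC_NR_PRIVATE_IRQS and VGIC_MIN_LPI.

#include <stdint.h>
#include <stdio.h>

#define NR_PRIVATE_IRQS	32	/* SGIs 0-15 + PPIs 16-31 */
#define MIN_LPI		8192

static const char *classify_intid(uint32_t intid, uint32_t nr_spis)
{
	if (intid < NR_PRIVATE_IRQS)
		return intid < 16 ? "SGI (per-vCPU)" : "PPI (per-vCPU)";
	if (intid < NR_PRIVATE_IRQS + nr_spis)
		return "SPI (per-VM array)";
	if (intid >= MIN_LPI)
		return "LPI (per-VM list, refcounted)";
	return "reserved";
}

int main(void)
{
	uint32_t ids[] = { 4, 27, 40, 1020, 8199 };

	for (unsigned int i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		printf("INTID %4u: %s\n", ids[i], classify_intid(ids[i], 96));
	return 0;
}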
+ */ +void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + if (!kref_put(&irq->refcount, vgic_irq_release)) + return; + + list_del(&irq->lpi_list); + dist->lpi_list_count--; + + kfree(irq); +} + +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags; + + if (irq->intid < VGIC_MIN_LPI) + return; + + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); + __vgic_put_lpi_locked(kvm, irq); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); +} + +void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq, *tmp; + unsigned long flags; + + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + + list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { + if (irq->intid >= VGIC_MIN_LPI) { + raw_spin_lock(&irq->irq_lock); + list_del(&irq->ap_list); + irq->vcpu = NULL; + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); + } + } + + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); +} + +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) +{ + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + pending)); +} + +bool vgic_get_phys_line_level(struct vgic_irq *irq) +{ + bool line_level; + + BUG_ON(!irq->hw); + + if (irq->get_input_level) + return irq->get_input_level(irq->intid); + + WARN_ON(irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &line_level)); + return line_level; +} + +/* Set/Clear the physical active state */ +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) +{ + + BUG_ON(!irq->hw); + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_ACTIVE, + active)); +} + +/** + * kvm_vgic_target_oracle - compute the target vcpu for an irq + * + * @irq: The irq to route. Must be already locked. + * + * Based on the current state of the interrupt (enabled, pending, + * active, vcpu and target_vcpu), compute the next vcpu this should be + * given to. Return NULL if this shouldn't be injected at all. + * + * Requires the IRQ lock to be held. + */ +static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +{ + lockdep_assert_held(&irq->irq_lock); + + /* If the interrupt is active, it must stay on the current vcpu */ + if (irq->active) + return irq->vcpu ? : irq->target_vcpu; + + /* + * If the IRQ is not active but enabled and pending, we should direct + * it to its configured target VCPU. + * If the distributor is disabled, pending interrupts shouldn't be + * forwarded. + */ + if (irq->enabled && irq_is_pending(irq)) { + if (unlikely(irq->target_vcpu && + !irq->target_vcpu->kvm->arch.vgic.enabled)) + return NULL; + + return irq->target_vcpu; + } + + /* If neither active nor pending and enabled, then this IRQ should not + * be queued to any VCPU. + */ + return NULL; +} + +/* + * The order of items in the ap_lists defines how we'll pack things in LRs as + * well, the first items in the list being the first things populated in the + * LRs. + * + * A hard rule is that active interrupts can never be pushed out of the LRs + * (and therefore take priority) since we cannot reliably trap on deactivation + * of IRQs and therefore they have to be present in the LRs. + * + * Otherwise things should be sorted by the priority field and the GIC + * hardware support will take care of preemption of priority groups etc. 
+ * + * Return negative if "a" sorts before "b", 0 to preserve order, and positive + * to sort "b" before "a". + */ +static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); + struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + bool penda, pendb; + int ret; + + /* + * list_sort may call this function with the same element when + * the list is fairly long. + */ + if (unlikely(irqa == irqb)) + return 0; + + raw_spin_lock(&irqa->irq_lock); + raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); + + if (irqa->active || irqb->active) { + ret = (int)irqb->active - (int)irqa->active; + goto out; + } + + penda = irqa->enabled && irq_is_pending(irqa); + pendb = irqb->enabled && irq_is_pending(irqb); + + if (!penda || !pendb) { + ret = (int)pendb - (int)penda; + goto out; + } + + /* Both pending and enabled, sort by priority */ + ret = irqa->priority - irqb->priority; +out: + raw_spin_unlock(&irqb->irq_lock); + raw_spin_unlock(&irqa->irq_lock); + return ret; +} + +/* Must be called with the ap_list_lock held */ +static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); +} + +/* + * Only valid injection if changing level for level-triggered IRQs or for a + * rising edge, and in-kernel connected IRQ lines can only be controlled by + * their owner. + */ +static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner) +{ + if (irq->owner != owner) + return false; + + switch (irq->config) { + case VGIC_CONFIG_LEVEL: + return irq->line_level != level; + case VGIC_CONFIG_EDGE: + return level; + } + + return false; +} + +/* + * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. + * Do the queuing if necessary, taking the right locks in the right order. + * Returns true when the IRQ was queued, false otherwise. + * + * Needs to be entered with the IRQ lock already held, but will return + * with all locks dropped. + */ +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags) +{ + struct kvm_vcpu *vcpu; + + lockdep_assert_held(&irq->irq_lock); + +retry: + vcpu = vgic_target_oracle(irq); + if (irq->vcpu || !vcpu) { + /* + * If this IRQ is already on a VCPU's ap_list, then it + * cannot be moved or modified and there is no more work for + * us to do. + * + * Otherwise, if the irq is not pending and enabled, it does + * not need to be inserted into an ap_list and there is also + * no more work for us to do. + */ + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + /* + * We have to kick the VCPU here, because we could be + * queueing an edge-triggered interrupt for which we + * get no EOI maintenance interrupt. In that case, + * while the IRQ is already on the VCPU's AP list, the + * VCPU could have EOI'ed the original interrupt and + * won't see this one until it exits for some other + * reason. + */ + if (vcpu) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } + return false; + } + + /* + * We must unlock the irq lock to take the ap_list_lock where + * we are going to insert this new pending interrupt. 
+ */ + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + /* someone can do stuff here, which we re-check below */ + + raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags); + raw_spin_lock(&irq->irq_lock); + + /* + * Did something change behind our backs? + * + * There are two cases: + * 1) The irq lost its pending state or was disabled behind our + * backs and/or it was queued to another VCPU's ap_list. + * 2) Someone changed the affinity on this irq behind our + * backs and we are now holding the wrong ap_list_lock. + * + * In both cases, drop the locks and retry. + */ + + if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, + flags); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + goto retry; + } + + /* + * Grab a reference to the irq to reflect the fact that it is + * now in the ap_list. + */ + vgic_get_irq_kref(irq); + list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); + irq->vcpu = vcpu; + + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); + + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return true; +} + +/** + * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic + * @kvm: The VM structure pointer + * @cpuid: The CPU for PPIs + * @intid: The INTID to inject a new state to. + * @level: Edge-triggered: true: to trigger the interrupt + * false: to ignore the call + * Level-sensitive true: raise the input signal + * false: lower the input signal + * @owner: The opaque pointer to the owner of the IRQ being raised to verify + * that the caller is allowed to inject this IRQ. Userspace + * injections will have owner == NULL. + * + * The VGIC is not concerned with devices being active-LOW or active-HIGH for + * level-sensitive interrupts. You can think of the level parameter as 1 + * being HIGH and 0 being LOW and all devices being active-HIGH. + */ +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level, void *owner) +{ + struct kvm_vcpu *vcpu; + struct vgic_irq *irq; + unsigned long flags; + int ret; + + trace_vgic_update_irq_pending(cpuid, intid, level); + + ret = vgic_lazy_init(kvm); + if (ret) + return ret; + + vcpu = kvm_get_vcpu(kvm, cpuid); + if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + irq = vgic_get_irq(kvm, vcpu, intid); + if (!irq) + return -EINVAL; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + if (!vgic_validate_injection(irq, level, owner)) { + /* Nothing to see here, move along... 
*/ + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(kvm, irq); + return 0; + } + + if (irq->config == VGIC_CONFIG_LEVEL) + irq->line_level = level; + else + irq->pending_latch = true; + + vgic_queue_irq_unlock(kvm, irq, flags); + vgic_put_irq(kvm, irq); + + return 0; +} + +/* @irq->irq_lock must be held */ +static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + unsigned int host_irq, + bool (*get_input_level)(int vindid)) +{ + struct irq_desc *desc; + struct irq_data *data; + + /* + * Find the physical IRQ number corresponding to @host_irq + */ + desc = irq_to_desc(host_irq); + if (!desc) { + kvm_err("%s: no interrupt descriptor\n", __func__); + return -EINVAL; + } + data = irq_desc_get_irq_data(desc); + while (data->parent_data) + data = data->parent_data; + + irq->hw = true; + irq->host_irq = host_irq; + irq->hwintid = data->hwirq; + irq->get_input_level = get_input_level; + return 0; +} + +/* @irq->irq_lock must be held */ +static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) +{ + irq->hw = false; + irq->hwintid = 0; + irq->get_input_level = NULL; +} + +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, + u32 vintid, bool (*get_input_level)(int vindid)) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + unsigned long flags; + int ret; + + BUG_ON(!irq); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + return ret; +} + +/** + * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ + * @vcpu: The VCPU pointer + * @vintid: The INTID of the interrupt + * + * Reset the active and pending states of a mapped interrupt. Kernel + * subsystems injecting mapped interrupts should reset their interrupt lines + * when we are doing a reset of the VM. + */ +void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + unsigned long flags; + + if (!irq->hw) + goto out; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->active = false; + irq->pending_latch = false; + irq->line_level = false; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); +out: + vgic_put_irq(vcpu->kvm, irq); +} + +int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) +{ + struct vgic_irq *irq; + unsigned long flags; + + if (!vgic_initialized(vcpu->kvm)) + return -EAGAIN; + + irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + BUG_ON(!irq); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + kvm_vgic_unmap_irq(irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + return 0; +} + +/** + * kvm_vgic_set_owner - Set the owner of an interrupt for a VM + * + * @vcpu: Pointer to the VCPU (used for PPIs) + * @intid: The virtual INTID identifying the interrupt (PPI or SPI) + * @owner: Opaque pointer to the owner + * + * Returns 0 if intid is not already used by another in-kernel device and the + * owner is set, otherwise returns an error code. 
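Taken together, kvm_vgic_inject_irq() and kvm_vgic_set_owner() give in-kernel devices the two-step contract the comments above describe: claim the line once, then raise or lower it with the matching owner cookie. A hedged sketch of such a caller; the demo_timer structure and its PPI number are made up for illustration, only the two vgic calls come from the patch.

/* Illustrative in-kernel device model; not part of the patch. */
struct demo_timer {
	struct kvm_vcpu *vcpu;
	u32 ppi;		/* e.g. a per-vCPU PPI in the 16..31 range */
};

static int demo_timer_init(struct demo_timer *t)
{
	/* Claim the PPI so no other in-kernel user can inject on it. */
	return kvm_vgic_set_owner(t->vcpu, t->ppi, t);
}

static int demo_timer_set_level(struct demo_timer *t, bool level)
{
	/* Level-sensitive line: true raises the input, false lowers it. */
	return kvm_vgic_inject_irq(t->vcpu->kvm, t->vcpu->vcpu_id,
				   t->ppi, level, t);
}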
+ */ +int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) +{ + struct vgic_irq *irq; + unsigned long flags; + int ret = 0; + + if (!vgic_initialized(vcpu->kvm)) + return -EAGAIN; + + /* SGIs and LPIs cannot be wired up to any device */ + if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid)) + return -EINVAL; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->owner && irq->owner != owner) + ret = -EEXIST; + else + irq->owner = owner; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + return ret; +} + +/** + * vgic_prune_ap_list - Remove non-relevant interrupts from the list + * + * @vcpu: The VCPU pointer + * + * Go over the list of "interesting" interrupts, and prune those that we + * won't have to consider in the near future. + */ +static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq, *tmp; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + +retry: + raw_spin_lock(&vgic_cpu->ap_list_lock); + + list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { + struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; + bool target_vcpu_needs_kick = false; + + raw_spin_lock(&irq->irq_lock); + + BUG_ON(vcpu != irq->vcpu); + + target_vcpu = vgic_target_oracle(irq); + + if (!target_vcpu) { + /* + * We don't need to process this interrupt any + * further, move it off the list. + */ + list_del(&irq->ap_list); + irq->vcpu = NULL; + raw_spin_unlock(&irq->irq_lock); + + /* + * This vgic_put_irq call matches the + * vgic_get_irq_kref in vgic_queue_irq_unlock, + * where we added the LPI to the ap_list. As + * we remove the irq from the list, we drop + * also drop the refcount. + */ + vgic_put_irq(vcpu->kvm, irq); + continue; + } + + if (target_vcpu == vcpu) { + /* We're on the right CPU */ + raw_spin_unlock(&irq->irq_lock); + continue; + } + + /* This interrupt looks like it has to be migrated. */ + + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&vgic_cpu->ap_list_lock); + + /* + * Ensure locking order by always locking the smallest + * ID first. + */ + if (vcpu->vcpu_id < target_vcpu->vcpu_id) { + vcpuA = vcpu; + vcpuB = target_vcpu; + } else { + vcpuA = target_vcpu; + vcpuB = vcpu; + } + + raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); + raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, + SINGLE_DEPTH_NESTING); + raw_spin_lock(&irq->irq_lock); + + /* + * If the affinity has been preserved, move the + * interrupt around. Otherwise, it means things have + * changed while the interrupt was unlocked, and we + * need to replay this. + * + * In all cases, we cannot trust the list not to have + * changed, so we restart from the beginning. 
+ */ + if (target_vcpu == vgic_target_oracle(irq)) { + struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu; + + list_del(&irq->ap_list); + irq->vcpu = target_vcpu; + list_add_tail(&irq->ap_list, &new_cpu->ap_list_head); + target_vcpu_needs_kick = true; + } + + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); + raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); + + if (target_vcpu_needs_kick) { + kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu); + kvm_vcpu_kick(target_vcpu); + } + + goto retry; + } + + raw_spin_unlock(&vgic_cpu->ap_list_lock); +} + +static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_fold_lr_state(vcpu); + else + vgic_v3_fold_lr_state(vcpu); +} + +/* Requires the irq_lock to be held. */ +static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, + struct vgic_irq *irq, int lr) +{ + lockdep_assert_held(&irq->irq_lock); + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_populate_lr(vcpu, irq, lr); + else + vgic_v3_populate_lr(vcpu, irq, lr); +} + +static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_clear_lr(vcpu, lr); + else + vgic_v3_clear_lr(vcpu, lr); +} + +static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_underflow(vcpu); + else + vgic_v3_set_underflow(vcpu); +} + +/* Requires the ap_list_lock to be held. */ +static int compute_ap_list_depth(struct kvm_vcpu *vcpu, + bool *multi_sgi) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + int count = 0; + + *multi_sgi = false; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + int w; + + raw_spin_lock(&irq->irq_lock); + /* GICv2 SGIs can count for more than one... */ + w = vgic_irq_get_lr_count(irq); + raw_spin_unlock(&irq->irq_lock); + + count += w; + *multi_sgi |= (w > 1); + } + return count; +} + +/* Requires the VCPU's ap_list_lock to be held. */ +static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + int count; + bool multi_sgi; + u8 prio = 0xff; + int i = 0; + + lockdep_assert_held(&vgic_cpu->ap_list_lock); + + count = compute_ap_list_depth(vcpu, &multi_sgi); + if (count > kvm_vgic_global_state.nr_lr || multi_sgi) + vgic_sort_ap_list(vcpu); + + count = 0; + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + raw_spin_lock(&irq->irq_lock); + + /* + * If we have multi-SGIs in the pipeline, we need to + * guarantee that they are all seen before any IRQ of + * lower priority. In that case, we need to filter out + * these interrupts by exiting early. This is easy as + * the AP list has been sorted already. 
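compute_ap_list_depth() above weights each ap_list entry by how many list registers it will eventually need, which only exceeds one for a GICv2 SGI pending from several sources (plus one if it is also active); that weight is what triggers the sort and the multi-SGI filtering. A standalone version of the same weighting, mirroring vgic_irq_get_lr_count() from vgic.h:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One LR per pending source CPU, plus one if the SGI is also active. */
static int sgi_lr_count(uint8_t source_bitmap, bool active)
{
	return __builtin_popcount(source_bitmap) + active;
}

int main(void)
{
	/* SGI pending from CPUs 0, 2 and 3, currently active on this vCPU. */
	int w = sgi_lr_count((1U << 0) | (1U << 2) | (1U << 3), true);

	printf("LR slots needed: %d, multi-SGI: %s\n", w, w > 1 ? "yes" : "no");
	return 0;
}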
+ */ + if (multi_sgi && irq->priority > prio) { + _raw_spin_unlock(&irq->irq_lock); + break; + } + + if (likely(vgic_target_oracle(irq) == vcpu)) { + vgic_populate_lr(vcpu, irq, count++); + + if (irq->source) + prio = irq->priority; + } + + raw_spin_unlock(&irq->irq_lock); + + if (count == kvm_vgic_global_state.nr_lr) { + if (!list_is_last(&irq->ap_list, + &vgic_cpu->ap_list_head)) + vgic_set_underflow(vcpu); + break; + } + } + + /* Nuke remaining LRs */ + for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + vgic_clear_lr(vcpu, i); + + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; + else + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; +} + +static inline bool can_access_vgic_from_kernel(void) +{ + /* + * GICv2 can always be accessed from the kernel because it is + * memory-mapped, and VHE systems can access GICv3 EL2 system + * registers. + */ + return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe(); +} + +static inline void vgic_save_state(struct kvm_vcpu *vcpu) +{ + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_save_state(vcpu); + else + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); +} + +/* Sync back the hardware VGIC state into our emulation after a guest's run. */ +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ + int used_lrs; + + /* An empty ap_list_head implies used_lrs == 0 */ + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) + return; + + if (can_access_vgic_from_kernel()) + vgic_save_state(vcpu); + + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; + else + used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + + if (used_lrs) + vgic_fold_lr_state(vcpu); + vgic_prune_ap_list(vcpu); +} + +static inline void vgic_restore_state(struct kvm_vcpu *vcpu) +{ + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v2_restore_state(vcpu); + else + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); +} + +/* Flush our emulation state into the GIC hardware before entering the guest. */ +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) +{ + /* + * If there are no virtual interrupts active or pending for this + * VCPU, then there is no work to do and we can bail out without + * taking any lock. There is a potential race with someone injecting + * interrupts to the VCPU, but it is a benign race as the VCPU will + * either observe the new interrupt before or after doing this check, + * and introducing additional synchronization mechanism doesn't change + * this. + * + * Note that we still need to go through the whole thing if anything + * can be directly injected (GICv4). 
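The save/restore helpers above are driven once per guest entry and exit: flush the software model into the list registers before running, then sync the list registers back and prune the ap_list afterwards. The run loop itself lives outside this file; the shape below is only an illustration of that ordering, not the actual arch code.

/* Illustrative shape of one run-loop iteration; not the actual arch code. */
static void demo_run_once(struct kvm_vcpu *vcpu)
{
	kvm_vgic_flush_hwstate(vcpu);	/* ap_list -> list registers */

	/* ... enter the guest, run until the next exit ... */

	kvm_vgic_sync_hwstate(vcpu);	/* list registers -> ap_list, prune */
}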
+ */ + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && + !vgic_supports_direct_msis(vcpu->kvm)) + return; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { + raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + vgic_flush_lr_state(vcpu); + raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); + } + + if (can_access_vgic_from_kernel()) + vgic_restore_state(vcpu); +} + +void kvm_vgic_load(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_load(vcpu); + else + vgic_v3_load(vcpu); +} + +void kvm_vgic_put(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_put(vcpu); + else + vgic_v3_put(vcpu); +} + +void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_vmcr_sync(vcpu); + else + vgic_v3_vmcr_sync(vcpu); +} + +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + bool pending = false; + unsigned long flags; + struct vgic_vmcr vmcr; + + if (!vcpu->kvm->arch.vgic.enabled) + return false; + + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) + return true; + + vgic_get_vmcr(vcpu, &vmcr); + + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + raw_spin_lock(&irq->irq_lock); + pending = irq_is_pending(irq) && irq->enabled && + !irq->active && + irq->priority < vmcr.pmr; + raw_spin_unlock(&irq->irq_lock); + + if (pending) + break; + } + + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + return pending; +} + +void vgic_kick_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int c; + + /* + * We've injected an interrupt, time to find out who deserves + * a good kick... + */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } + } +} + +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) +{ + struct vgic_irq *irq; + bool map_is_active; + unsigned long flags; + + if (!vgic_initialized(vcpu->kvm)) + return false; + + irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + raw_spin_lock_irqsave(&irq->irq_lock, flags); + map_is_active = irq->hw && irq->active; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(vcpu->kvm, irq); + + return map_is_active; +} diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h new file mode 100644 index 000000000000..64fcd7511110 --- /dev/null +++ b/arch/arm64/kvm/vgic/vgic.h @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015, 2016 ARM Ltd. 
+ */ +#ifndef __KVM_ARM_VGIC_NEW_H__ +#define __KVM_ARM_VGIC_NEW_H__ + +#include <linux/irqchip/arm-gic-common.h> + +#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ +#define IMPLEMENTER_ARM 0x43b + +#define VGIC_ADDR_UNDEF (-1) +#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) + +#define INTERRUPT_ID_BITS_SPIS 10 +#define INTERRUPT_ID_BITS_ITS 16 +#define VGIC_PRI_BITS 5 + +#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) + +#define VGIC_AFFINITY_0_SHIFT 0 +#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT) +#define VGIC_AFFINITY_1_SHIFT 8 +#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT) +#define VGIC_AFFINITY_2_SHIFT 16 +#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT) +#define VGIC_AFFINITY_3_SHIFT 24 +#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT) + +#define VGIC_AFFINITY_LEVEL(reg, level) \ + ((((reg) & VGIC_AFFINITY_## level ##_MASK) \ + >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +/* + * Userspace encodes the affinity differently from the MPIDR; + * the macro below converts the vgic userspace format to the MPIDR + * register format. + */ +#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \ + VGIC_AFFINITY_LEVEL(val, 1) | \ + VGIC_AFFINITY_LEVEL(val, 2) | \ + VGIC_AFFINITY_LEVEL(val, 3)) + +/* + * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst, + * the macros below define the CPUREG encoding. + */ +#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 +#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14 +#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800 +#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11 +#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780 +#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7 +#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078 +#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3 +#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007 +#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0 + +#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) + +/* + * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, + * the macros below define the ITS table entry encoding.
+ */ +#define KVM_ITS_CTE_VALID_SHIFT 63 +#define KVM_ITS_CTE_VALID_MASK BIT_ULL(63) +#define KVM_ITS_CTE_RDBASE_SHIFT 16 +#define KVM_ITS_CTE_ICID_MASK GENMASK_ULL(15, 0) +#define KVM_ITS_ITE_NEXT_SHIFT 48 +#define KVM_ITS_ITE_PINTID_SHIFT 16 +#define KVM_ITS_ITE_PINTID_MASK GENMASK_ULL(47, 16) +#define KVM_ITS_ITE_ICID_MASK GENMASK_ULL(15, 0) +#define KVM_ITS_DTE_VALID_SHIFT 63 +#define KVM_ITS_DTE_VALID_MASK BIT_ULL(63) +#define KVM_ITS_DTE_NEXT_SHIFT 49 +#define KVM_ITS_DTE_NEXT_MASK GENMASK_ULL(62, 49) +#define KVM_ITS_DTE_ITTADDR_SHIFT 5 +#define KVM_ITS_DTE_ITTADDR_MASK GENMASK_ULL(48, 5) +#define KVM_ITS_DTE_SIZE_MASK GENMASK_ULL(4, 0) +#define KVM_ITS_L1E_VALID_MASK BIT_ULL(63) +/* we only support 64 kB translation table page size */ +#define KVM_ITS_L1E_ADDR_MASK GENMASK_ULL(51, 16) + +#define KVM_VGIC_V3_RDIST_INDEX_MASK GENMASK_ULL(11, 0) +#define KVM_VGIC_V3_RDIST_FLAGS_MASK GENMASK_ULL(15, 12) +#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT 12 +#define KVM_VGIC_V3_RDIST_BASE_MASK GENMASK_ULL(51, 16) +#define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52) +#define KVM_VGIC_V3_RDIST_COUNT_SHIFT 52 + +#ifdef CONFIG_DEBUG_SPINLOCK +#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) +#else +#define DEBUG_SPINLOCK_BUG_ON(p) +#endif + +/* Requires the irq_lock to be held by the caller. */ +static inline bool irq_is_pending(struct vgic_irq *irq) +{ + if (irq->config == VGIC_CONFIG_EDGE) + return irq->pending_latch; + else + return irq->pending_latch || irq->line_level; +} + +static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) +{ + return irq->config == VGIC_CONFIG_LEVEL && irq->hw; +} + +static inline int vgic_irq_get_lr_count(struct vgic_irq *irq) +{ + /* Account for the active state as an interrupt */ + if (vgic_irq_is_sgi(irq->intid) && irq->source) + return hweight8(irq->source) + irq->active; + + return irq_is_pending(irq) || irq->active; +} + +static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) +{ + return vgic_irq_get_lr_count(irq) > 1; +} + +/* + * This struct provides an intermediate representation of the fields contained + * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC + * state to userspace can generate either GICv2 or GICv3 CPU interface + * registers regardless of the hardware backed GIC used. 
+ */ +struct vgic_vmcr { + u32 grpen0; + u32 grpen1; + + u32 ackctl; + u32 fiqen; + u32 cbpr; + u32 eoim; + + u32 abpr; + u32 bpr; + u32 pmr; /* Priority mask field in the GICC_PMR and + * ICC_PMR_EL1 priority field format */ +}; + +struct vgic_reg_attr { + struct kvm_vcpu *vcpu; + gpa_t addr; +}; + +int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr); +int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr); +const struct vgic_register_region * +vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, int len); +struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + u32 intid); +void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq); +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); +bool vgic_get_phys_line_level(struct vgic_irq *irq); +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags); +void vgic_kick_vcpus(struct kvm *kvm); + +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment); + +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v2_set_npie(struct kvm_vcpu *vcpu); +int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); +int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v2_enable(struct kvm_vcpu *vcpu); +int vgic_v2_probe(const struct gic_kvm_info *info); +int vgic_v2_map_resources(struct kvm *kvm); +int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, + enum vgic_type); + +void vgic_v2_init_lrs(void); +void vgic_v2_load(struct kvm_vcpu *vcpu); +void vgic_v2_put(struct kvm_vcpu *vcpu); +void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); + +void vgic_v2_save_state(struct kvm_vcpu *vcpu); +void vgic_v2_restore_state(struct kvm_vcpu *vcpu); + +static inline void vgic_get_irq_kref(struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return; + + kref_get(&irq->refcount); +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_set_npie(struct kvm_vcpu *vcpu); +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v3_enable(struct kvm_vcpu *vcpu); +int vgic_v3_probe(const struct gic_kvm_info *info); +int vgic_v3_map_resources(struct kvm *kvm); +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); +int vgic_v3_save_pending_tables(struct kvm *kvm); +int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count); +int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); +bool vgic_v3_check_base(struct kvm *kvm); + +void vgic_v3_load(struct kvm_vcpu 
*vcpu); +void vgic_v3_put(struct kvm_vcpu *vcpu); +void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); + +bool vgic_has_its(struct kvm *kvm); +int kvm_vgic_register_its_device(void); +void vgic_enable_lpis(struct kvm_vcpu *vcpu); +void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); +int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); +int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u64 id, u64 *val); +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, + u64 *reg); +int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u32 intid, u64 *val); +int kvm_register_vgic_device(unsigned long type); +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +int vgic_lazy_init(struct kvm *kvm); +int vgic_init(struct kvm *kvm); + +void vgic_debug_init(struct kvm *kvm); +void vgic_debug_destroy(struct kvm *kvm); + +bool lock_all_vcpus(struct kvm *kvm); +void unlock_all_vcpus(struct kvm *kvm); + +static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; + + /* + * num_pri_bits are initialized with HW supported values. + * We can rely safely on num_pri_bits even if VM has not + * restored ICC_CTLR_EL1 before restoring APnR registers. + */ + switch (cpu_if->num_pri_bits) { + case 7: return 3; + case 6: return 1; + default: return 0; + } +} + +static inline bool +vgic_v3_redist_region_full(struct vgic_redist_region *region) +{ + if (!region->count) + return false; + + return (region->free_index >= region->count); +} + +struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs); + +static inline size_t +vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) +{ + if (!rdreg->count) + return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE; + else + return rdreg->count * KVM_VGIC_V3_REDIST_SIZE; +} + +struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, + u32 index); + +bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); + +static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) +{ + struct vgic_dist *d = &kvm->arch.vgic; + + return (base + size > d->vgic_dist_base) && + (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); +} + +int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); +int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, struct vgic_irq **irq); +struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); +int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); +void vgic_lpi_translation_cache_init(struct kvm *kvm); +void vgic_lpi_translation_cache_destroy(struct kvm *kvm); +void vgic_its_invalidate_cache(struct kvm *kvm); + +bool vgic_supports_direct_msis(struct kvm *kvm); +int vgic_v4_init(struct kvm *kvm); +void vgic_v4_teardown(struct kvm *kvm); +void vgic_v4_configure_vsgis(struct kvm *kvm); + +#endif diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 8e25e89ad01f..0f8a3a9e3795 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -20,36 +20,36 @@ * x0 - 
bytes not copied */ - .macro ldrb1 ptr, regB, val - uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val + .macro ldrb1 reg, ptr, val + uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val .endm - .macro strb1 ptr, regB, val - strb \ptr, [\regB], \val + .macro strb1 reg, ptr, val + strb \reg, [\ptr], \val .endm - .macro ldrh1 ptr, regB, val - uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val + .macro ldrh1 reg, ptr, val + uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val .endm - .macro strh1 ptr, regB, val - strh \ptr, [\regB], \val + .macro strh1 reg, ptr, val + strh \reg, [\ptr], \val .endm - .macro ldr1 ptr, regB, val - uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val + .macro ldr1 reg, ptr, val + uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val .endm - .macro str1 ptr, regB, val - str \ptr, [\regB], \val + .macro str1 reg, ptr, val + str \reg, [\ptr], \val .endm - .macro ldp1 ptr, regB, regC, val - uao_ldp 9998f, \ptr, \regB, \regC, \val + .macro ldp1 reg1, reg2, ptr, val + uao_ldp 9998f, \reg1, \reg2, \ptr, \val .endm - .macro stp1 ptr, regB, regC, val - stp \ptr, \regB, [\regC], \val + .macro stp1 reg1, reg2, ptr, val + stp \reg1, \reg2, [\ptr], \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 667139013ed1..80e37ada0ee1 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -21,36 +21,36 @@ * Returns: * x0 - bytes not copied */ - .macro ldrb1 ptr, regB, val - uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val + .macro ldrb1 reg, ptr, val + uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val .endm - .macro strb1 ptr, regB, val - uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val + .macro strb1 reg, ptr, val + uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val .endm - .macro ldrh1 ptr, regB, val - uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val + .macro ldrh1 reg, ptr, val + uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val .endm - .macro strh1 ptr, regB, val - uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val + .macro strh1 reg, ptr, val + uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val .endm - .macro ldr1 ptr, regB, val - uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val + .macro ldr1 reg, ptr, val + uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val .endm - .macro str1 ptr, regB, val - uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val + .macro str1 reg, ptr, val + uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val .endm - .macro ldp1 ptr, regB, regC, val - uao_ldp 9998f, \ptr, \regB, \regC, \val + .macro ldp1 reg1, reg2, ptr, val + uao_ldp 9998f, \reg1, \reg2, \ptr, \val .endm - .macro stp1 ptr, regB, regC, val - uao_stp 9998f, \ptr, \regB, \regC, \val + .macro stp1 reg1, reg2, ptr, val + uao_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 1a104d0089f3..4ec59704b8f2 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -19,36 +19,36 @@ * Returns: * x0 - bytes not copied */ - .macro ldrb1 ptr, regB, val - ldrb \ptr, [\regB], \val + .macro ldrb1 reg, ptr, val + ldrb \reg, [\ptr], \val .endm - .macro strb1 ptr, regB, val - uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val + .macro strb1 reg, ptr, val + uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val .endm - .macro ldrh1 ptr, regB, val - ldrh \ptr, [\regB], \val + .macro ldrh1 reg, ptr, val + 
ldrh \reg, [\ptr], \val .endm - .macro strh1 ptr, regB, val - uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val + .macro strh1 reg, ptr, val + uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val .endm - .macro ldr1 ptr, regB, val - ldr \ptr, [\regB], \val + .macro ldr1 reg, ptr, val + ldr \reg, [\ptr], \val .endm - .macro str1 ptr, regB, val - uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val + .macro str1 reg, ptr, val + uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val .endm - .macro ldp1 ptr, regB, regC, val - ldp \ptr, \regB, [\regC], \val + .macro ldp1 reg1, reg2, ptr, val + ldp \reg1, \reg2, [\ptr], \val .endm - .macro stp1 ptr, regB, regC, val - uao_stp 9998f, \ptr, \regB, \regC, \val + .macro stp1 reg1, reg2, ptr, val + uao_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 243e107e9896..0f9e10ecda23 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -9,7 +9,7 @@ #include <asm/alternative.h> #include <asm/assembler.h> - .cpu generic+crc + .arch armv8-a+crc .macro __crc32, c cmp x2, #16 diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 9f382adfa88a..e0bf83d556f2 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -24,36 +24,36 @@ * Returns: * x0 - dest */ - .macro ldrb1 ptr, regB, val - ldrb \ptr, [\regB], \val + .macro ldrb1 reg, ptr, val + ldrb \reg, [\ptr], \val .endm - .macro strb1 ptr, regB, val - strb \ptr, [\regB], \val + .macro strb1 reg, ptr, val + strb \reg, [\ptr], \val .endm - .macro ldrh1 ptr, regB, val - ldrh \ptr, [\regB], \val + .macro ldrh1 reg, ptr, val + ldrh \reg, [\ptr], \val .endm - .macro strh1 ptr, regB, val - strh \ptr, [\regB], \val + .macro strh1 reg, ptr, val + strh \reg, [\ptr], \val .endm - .macro ldr1 ptr, regB, val - ldr \ptr, [\regB], \val + .macro ldr1 reg, ptr, val + ldr \reg, [\ptr], \val .endm - .macro str1 ptr, regB, val - str \ptr, [\regB], \val + .macro str1 reg, ptr, val + str \reg, [\ptr], \val .endm - .macro ldp1 ptr, regB, regC, val - ldp \ptr, \regB, [\regC], \val + .macro ldp1 reg1, reg2, ptr, val + ldp \reg1, \reg2, [\ptr], \val .endm - .macro stp1 ptr, regB, regC, val - stp \ptr, \regB, [\regC], \val + .macro stp1 reg1, reg2, ptr, val + stp \reg1, \reg2, [\ptr], \val .endm .weak memcpy diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 9b26f9a88724..d702d60e64da 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -92,6 +92,9 @@ static void set_reserved_asid_bits(void) bitmap_clear(asid_map, 0, NUM_USER_ASIDS); } +#define asid_gen_match(asid) \ + (!(((asid) ^ atomic64_read(&asid_generation)) >> asid_bits)) + static void flush_context(void) { int i; @@ -220,8 +223,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) * because atomic RmWs are totally ordered for a given location. */ old_active_asid = atomic64_read(&per_cpu(active_asids, cpu)); - if (old_active_asid && - !((asid ^ atomic64_read(&asid_generation)) >> asid_bits) && + if (old_active_asid && asid_gen_match(asid) && atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu), old_active_asid, asid)) goto switch_mm_fastpath; @@ -229,7 +231,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_lock_irqsave(&cpu_asid_lock, flags); /* Check that our ASID belongs to the current generation. 
*/ asid = atomic64_read(&mm->context.id); - if ((asid ^ atomic64_read(&asid_generation)) >> asid_bits) { + if (!asid_gen_match(asid)) { asid = new_context(mm); atomic64_set(&mm->context.id, asid); } diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 860c00ec8bd3..0da020c563e6 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -146,6 +146,11 @@ static const struct prot_bits pte_bits[] = { .set = "UXN", .clear = " ", }, { + .mask = PTE_GP, + .val = PTE_GP, + .set = "GP", + .clear = " ", + }, { .mask = PTE_ATTRINDX_MASK, .val = PTE_ATTRINDX(MT_DEVICE_nGnRnE), .set = "DEVICE/nGnRnE", @@ -247,7 +252,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); static const char units[] = "KMGTPE"; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c9cedc0432d2..dff2d72b0883 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -635,11 +635,13 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) inf = esr_to_fault_info(esr); - /* - * Return value ignored as we rely on signal merging. - * Future patches will make this more robust. - */ - apei_claim_sea(regs); + if (user_mode(regs) && apei_claim_sea(regs) == 0) { + /* + * APEI claimed this as a firmware-first notification. + * Some processing deferred to task_work before ret_to_user(). + */ + return 0; + } if (esr & ESR_ELx_FnV) siaddr = NULL; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index bbeb6a5a6ba6..07f154b8b84a 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -230,6 +230,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, ptep = (pte_t *)pudp; } else if (sz == (CONT_PTE_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + return NULL; WARN_ON(addr & (sz - 1)); /* @@ -441,44 +443,30 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); } -static void __init add_huge_page_size(unsigned long size) -{ - if (size_to_hstate(size)) - return; - - hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); -} - static int __init hugetlbpage_init(void) { #ifdef CONFIG_ARM64_4K_PAGES - add_huge_page_size(PUD_SIZE); + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); #endif - add_huge_page_size(CONT_PMD_SIZE); - add_huge_page_size(PMD_SIZE); - add_huge_page_size(CONT_PTE_SIZE); + hugetlb_add_hstate((CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT); + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + hugetlb_add_hstate((CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT); return 0; } arch_initcall(hugetlbpage_init); -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long ps = memparse(opt, &opt); - - switch (ps) { + switch (size) { #ifdef CONFIG_ARM64_4K_PAGES case PUD_SIZE: #endif case CONT_PMD_SIZE: case PMD_SIZE: case CONT_PTE_SIZE: - add_huge_page_size(ps); - return 1; + return true; } - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); - return 0; + return false; } -__setup("hugepagesz=", setup_hugepagesz); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e42727e3568e..e631e6425165 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -192,8 +192,6 @@ static phys_addr_t __init max_zone_phys(unsigned int zone_bits) return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM()); } 
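The asid_gen_match() macro added to arch/arm64/mm/context.c above folds the open-coded generation test into one expression: an ASID carries its allocation generation in the bits above asid_bits, so XOR-ing it with the current generation and shifting right yields zero exactly when the ASID was allocated in the current generation. Below is a minimal user-space sketch of that idea only; the fixed ASID_BITS width, the plain (non-atomic) asid_generation variable and the main() driver are illustrative assumptions, not the kernel's implementation.

#include <stdint.h>
#include <stdio.h>

#define ASID_BITS 16 /* assumed width of the hardware ASID field */

/* Generations live in the bits above ASID_BITS; start at generation 1. */
static uint64_t asid_generation = 1ULL << ASID_BITS;

/* Mirror of the asid_gen_match() test: upper bits equal <=> same generation. */
static int asid_gen_match(uint64_t asid)
{
        return !((asid ^ asid_generation) >> ASID_BITS);
}

int main(void)
{
        uint64_t fresh = asid_generation | 42;     /* allocated this generation */
        uint64_t stale = (2ULL << ASID_BITS) | 42; /* allocated in another generation */

        printf("fresh matches: %d, stale matches: %d\n",
               asid_gen_match(fresh), asid_gen_match(stale));
        return 0;
}

On rollover the kernel bumps asid_generation to the next multiple of 1 << asid_bits, so every previously handed-out ASID stops matching and check_and_switch_context() falls through to allocating a new one.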
-#ifdef CONFIG_NUMA - static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; @@ -206,61 +204,9 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) #endif max_zone_pfns[ZONE_NORMAL] = max; - free_area_init_nodes(max_zone_pfns); -} - -#else - -static void __init zone_sizes_init(unsigned long min, unsigned long max) -{ - struct memblock_region *reg; - unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES]; - unsigned long __maybe_unused max_dma, max_dma32; - - memset(zone_size, 0, sizeof(zone_size)); - - max_dma = max_dma32 = min; -#ifdef CONFIG_ZONE_DMA - max_dma = max_dma32 = PFN_DOWN(arm64_dma_phys_limit); - zone_size[ZONE_DMA] = max_dma - min; -#endif -#ifdef CONFIG_ZONE_DMA32 - max_dma32 = PFN_DOWN(arm64_dma32_phys_limit); - zone_size[ZONE_DMA32] = max_dma32 - max_dma; -#endif - zone_size[ZONE_NORMAL] = max - max_dma32; - - memcpy(zhole_size, zone_size, sizeof(zhole_size)); - - for_each_memblock(memory, reg) { - unsigned long start = memblock_region_memory_base_pfn(reg); - unsigned long end = memblock_region_memory_end_pfn(reg); - -#ifdef CONFIG_ZONE_DMA - if (start >= min && start < max_dma) { - unsigned long dma_end = min(end, max_dma); - zhole_size[ZONE_DMA] -= dma_end - start; - start = dma_end; - } -#endif -#ifdef CONFIG_ZONE_DMA32 - if (start >= max_dma && start < max_dma32) { - unsigned long dma32_end = min(end, max_dma32); - zhole_size[ZONE_DMA32] -= dma32_end - start; - start = dma32_end; - } -#endif - if (start >= max_dma32 && start < max) { - unsigned long normal_end = min(end, max); - zhole_size[ZONE_NORMAL] -= normal_end - start; - } - } - - free_area_init_node(0, zone_size, min, zhole_size); + free_area_init(max_zone_pfns); } -#endif /* CONFIG_NUMA */ - int pfn_valid(unsigned long pfn) { phys_addr_t addr = pfn << PAGE_SHIFT; @@ -272,7 +218,7 @@ int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - if (!valid_section(__nr_to_section(pfn_to_section_nr(pfn)))) + if (!valid_section(__pfn_to_section(pfn))) return 0; #endif return memblock_is_map_memory(addr); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a374e4f51a62..c299b73dd5e4 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -610,6 +610,22 @@ core_initcall(map_entry_trampoline); #endif /* + * Open coded check for BTI, only for use to determine configuration + * for early mappings for before the cpufeature code has run. + */ +static bool arm64_early_this_cpu_has_bti(void) +{ + u64 pfr1; + + if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + return false; + + pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1); + return cpuid_feature_extract_unsigned_field(pfr1, + ID_AA64PFR1_BT_SHIFT); +} + +/* * Create fine-grained mappings for the kernel. */ static void __init map_kernel(pgd_t *pgdp) @@ -625,6 +641,14 @@ static void __init map_kernel(pgd_t *pgdp) pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; /* + * If we have a CPU that supports BTI and a kernel built for + * BTI then mark the kernel executable text as guarded pages + * now so we don't have to rewrite the page tables later. + */ + if (arm64_early_this_cpu_has_bti()) + text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); + + /* * Only rodata will be remapped with different permissions later on, * all other segments are allowed to use contiguous mappings. 
*/ diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 4decf1659700..aafcee3e3f7e 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -350,13 +350,16 @@ static int __init numa_register_nodes(void) struct memblock_region *mblk; /* Check that valid nid is set to memblks */ - for_each_memblock(memory, mblk) - if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) { + for_each_memblock(memory, mblk) { + int mblk_nid = memblock_get_region_node(mblk); + + if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) { pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", - mblk->nid, mblk->base, + mblk_nid, mblk->base, mblk->base + mblk->size - 1); return -EINVAL; } + } /* Finally register nodes. */ for_each_node_mask(nid, numa_nodes_parsed) { diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 250c49008d73..bde08090b838 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -126,13 +126,13 @@ int set_memory_nx(unsigned long addr, int numpages) { return change_memory_common(addr, numpages, __pgprot(PTE_PXN), - __pgprot(0)); + __pgprot(PTE_MAYBE_GP)); } int set_memory_x(unsigned long addr, int numpages) { return change_memory_common(addr, numpages, - __pgprot(0), + __pgprot(PTE_MAYBE_GP), __pgprot(PTE_PXN)); } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 197a9ba2d5ea..b7bebb12a56d 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -58,6 +58,8 @@ * cpu_do_suspend - save CPU registers context * * x0: virtual address of context pointer + * + * This must be kept in sync with struct cpu_suspend_ctx in <asm/suspend.h>. */ SYM_FUNC_START(cpu_do_suspend) mrs x2, tpidr_el0 @@ -82,6 +84,11 @@ alternative_endif stp x8, x9, [x0, #48] stp x10, x11, [x0, #64] stp x12, x13, [x0, #80] + /* + * Save x18 as it may be used as a platform register, e.g. by shadow + * call stack. + */ + str x18, [x0, #96] ret SYM_FUNC_END(cpu_do_suspend) @@ -98,6 +105,13 @@ SYM_FUNC_START(cpu_do_resume) ldp x9, x10, [x0, #48] ldp x11, x12, [x0, #64] ldp x13, x14, [x0, #80] + /* + * Restore x18, as it may be used as a platform register, and clear + * the buffer to minimize the risk of exposure when used for shadow + * call stack. + */ + ldr x18, [x0, #96] + str xzr, [x0, #96] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 @@ -139,7 +153,7 @@ alternative_if ARM64_HAS_RAS_EXTN msr_s SYS_DISR_EL1, xzr alternative_else_nop_endif - ptrauth_keys_install_kernel x14, 0, x1, x2, x3 + ptrauth_keys_install_kernel_nosync x14, x1, x2, x3 isb ret SYM_FUNC_END(cpu_do_resume) @@ -386,8 +400,6 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings) * * Initialise the processor for turning the MMU on. * - * Input: - * x0 with a flag ARM64_CPU_BOOT_PRIMARY/ARM64_CPU_BOOT_SECONDARY/ARM64_CPU_RUNTIME. * Output: * Return in x0 the value of the SCTLR_EL1 register. */ @@ -446,51 +458,9 @@ SYM_FUNC_START(__cpu_setup) 1: #endif /* CONFIG_ARM64_HW_AFDBM */ msr tcr_el1, x10 - mov x1, x0 /* * Prepare SCTLR */ mov_q x0, SCTLR_EL1_SET - -#ifdef CONFIG_ARM64_PTR_AUTH - /* No ptrauth setup for run time cpus */ - cmp x1, #ARM64_CPU_RUNTIME - b.eq 3f - - /* Check if the CPU supports ptrauth */ - mrs x2, id_aa64isar1_el1 - ubfx x2, x2, #ID_AA64ISAR1_APA_SHIFT, #8 - cbz x2, 3f - - /* - * The primary cpu keys are reset here and can be - * re-initialised with some proper values later. 
- */ - msr_s SYS_APIAKEYLO_EL1, xzr - msr_s SYS_APIAKEYHI_EL1, xzr - - /* Just enable ptrauth for primary cpu */ - cmp x1, #ARM64_CPU_BOOT_PRIMARY - b.eq 2f - - /* if !system_supports_address_auth() then skip enable */ -alternative_if_not ARM64_HAS_ADDRESS_AUTH - b 3f -alternative_else_nop_endif - - /* Install ptrauth key for secondary cpus */ - adr_l x2, secondary_data - ldr x3, [x2, #CPU_BOOT_TASK] // get secondary_data.task - cbz x3, 2f // check for slow booting cpus - ldp x3, x4, [x2, #CPU_BOOT_PTRAUTH_KEY] - msr_s SYS_APIAKEYLO_EL1, x3 - msr_s SYS_APIAKEYHI_EL1, x4 - -2: /* Enable ptrauth instructions */ - ldr x2, =SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ - SCTLR_ELx_ENDA | SCTLR_ELx_ENDB - orr x0, x0, x2 -3: -#endif ret // return to head.S SYM_FUNC_END(__cpu_setup) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index eb73f9f72c46..cc0cf0f5c7c3 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -100,6 +100,14 @@ /* Rd = Rn OP imm12 */ #define A64_ADD_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD) #define A64_SUB_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB) +#define A64_ADDS_I(sf, Rd, Rn, imm12) \ + A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD_SETFLAGS) +#define A64_SUBS_I(sf, Rd, Rn, imm12) \ + A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB_SETFLAGS) +/* Rn + imm12; set condition flags */ +#define A64_CMN_I(sf, Rn, imm12) A64_ADDS_I(sf, A64_ZR, Rn, imm12) +/* Rn - imm12; set condition flags */ +#define A64_CMP_I(sf, Rn, imm12) A64_SUBS_I(sf, A64_ZR, Rn, imm12) /* Rd = Rn */ #define A64_MOV(sf, Rd, Rn) A64_ADD_I(sf, Rd, Rn, 0) @@ -189,4 +197,26 @@ /* Rn & Rm; set condition flags */ #define A64_TST(sf, Rn, Rm) A64_ANDS(sf, A64_ZR, Rn, Rm) +/* Logical (immediate) */ +#define A64_LOGIC_IMM(sf, Rd, Rn, imm, type) ({ \ + u64 imm64 = (sf) ? (u64)imm : (u64)(u32)imm; \ + aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_##type, \ + A64_VARIANT(sf), Rn, Rd, imm64); \ +}) +/* Rd = Rn OP imm */ +#define A64_AND_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND) +#define A64_ORR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, ORR) +#define A64_EOR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, EOR) +#define A64_ANDS_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND_SETFLAGS) +/* Rn & imm; set condition flags */ +#define A64_TST_I(sf, Rn, imm) A64_ANDS_I(sf, A64_ZR, Rn, imm) + +/* HINTs */ +#define A64_HINT(x) aarch64_insn_gen_hint(x) + +/* BTI */ +#define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC) +#define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ) +#define A64_BTI_JC A64_HINT(AARCH64_INSN_HINT_BTIJC) + #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index cdc79de0c794..3cb25b43b368 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -167,11 +167,21 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) return to - from; } +static bool is_addsub_imm(u32 imm) +{ + /* Either imm12 or shifted imm12. 
*/ + return !(imm & ~0xfff) || !(imm & ~0xfff000); +} + /* Stack must be multiples of 16B */ #define STACK_ALIGN(sz) (((sz) + 15) & ~15) /* Tail call offset to jump into */ +#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) +#define PROLOGUE_OFFSET 8 +#else #define PROLOGUE_OFFSET 7 +#endif static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) { @@ -208,6 +218,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * */ + /* BTI landing pad */ + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + emit(A64_BTI_C, ctx); + /* Save FP and LR registers to stay align with ARM64 AAPCS */ emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx); emit(A64_MOV(1, A64_FP, A64_SP), ctx); @@ -230,6 +244,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) cur_offset, PROLOGUE_OFFSET); return -1; } + + /* BTI landing pad for the tail call, done with a BR */ + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + emit(A64_BTI_J, ctx); } ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth); @@ -356,6 +374,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, const bool isdw = BPF_SIZE(code) == BPF_DW; u8 jmp_cond, reg; s32 jmp_offset; + u32 a64_insn; #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -478,28 +497,55 @@ emit_bswap_uxt: /* dst = dst OP imm */ case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU64 | BPF_ADD | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_ADD(is64, dst, dst, tmp), ctx); + if (is_addsub_imm(imm)) { + emit(A64_ADD_I(is64, dst, dst, imm), ctx); + } else if (is_addsub_imm(-imm)) { + emit(A64_SUB_I(is64, dst, dst, -imm), ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_ADD(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_SUB(is64, dst, dst, tmp), ctx); + if (is_addsub_imm(imm)) { + emit(A64_SUB_I(is64, dst, dst, imm), ctx); + } else if (is_addsub_imm(-imm)) { + emit(A64_ADD_I(is64, dst, dst, -imm), ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_SUB(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_AND(is64, dst, dst, tmp), ctx); + a64_insn = A64_AND_I(is64, dst, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { + emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_AND(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_ORR(is64, dst, dst, tmp), ctx); + a64_insn = A64_ORR_I(is64, dst, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { + emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_ORR(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_EOR(is64, dst, dst, tmp), ctx); + a64_insn = A64_EOR_I(is64, dst, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { + emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_EOR(is64, dst, dst, tmp), ctx); + } break; case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: @@ -623,13 +669,24 @@ emit_cond_jmp: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_CMP(is64, dst, tmp), ctx); + if (is_addsub_imm(imm)) { + 
emit(A64_CMP_I(is64, dst, imm), ctx); + } else if (is_addsub_imm(-imm)) { + emit(A64_CMN_I(is64, dst, -imm), ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_CMP(is64, dst, tmp), ctx); + } goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: - emit_a64_mov_i(is64, tmp, imm, ctx); - emit(A64_TST(is64, dst, tmp), ctx); + a64_insn = A64_TST_I(is64, dst, imm); + if (a64_insn != AARCH64_BREAK_FAULT) { + emit(a64_insn, ctx); + } else { + emit_a64_mov_i(is64, tmp, imm, ctx); + emit(A64_TST(is64, dst, tmp), ctx); + } goto emit_cond_jmp; /* function call */ case BPF_JMP | BPF_CALL: |
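The ALU and compare cases rewritten above all follow the same decision: if the BPF constant fits the A64 add/sub immediate form (a 12-bit value, optionally shifted left by 12), emit one immediate instruction; if its negation fits, emit the complementary instruction; otherwise fall back to loading the constant into a temporary register first. The sketch below is a rough standalone illustration of that decision only: is_addsub_imm() mirrors the helper added above, while emit(), emit_add_imm() and the textual "instructions" are made-up stand-ins, not the JIT's real emit()/A64_* macros.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_addsub_imm(uint32_t imm)
{
        /* Either imm12 or imm12 shifted left by 12, as in the JIT helper. */
        return !(imm & ~0xfffu) || !(imm & ~0xfff000u);
}

/* Stand-in for the JIT's emit() helper, purely for illustration. */
static void emit(const char *insn)
{
        puts(insn);
}

static void emit_add_imm(int32_t imm)
{
        char buf[64];

        if (is_addsub_imm(imm)) {
                snprintf(buf, sizeof(buf), "add dst, dst, #%d", imm);
                emit(buf);
        } else if (is_addsub_imm(-imm)) {
                /* Adding a negative constant becomes a subtract. */
                snprintf(buf, sizeof(buf), "sub dst, dst, #%d", -imm);
                emit(buf);
        } else {
                /* Constant does not fit: materialise it in a scratch register. */
                emit("mov tmp, #<imm>  (multi-instruction constant)");
                emit("add dst, dst, tmp");
        }
}

int main(void)
{
        emit_add_imm(42);         /* fits imm12          -> single add */
        emit_add_imm(-42);        /* negation fits imm12 -> single sub */
        emit_add_imm(0x12345678); /* neither fits        -> mov + add  */
        return 0;
}

The logical-immediate cases above take the related AARCH64_BREAK_FAULT route instead, since not every constant is encodable as an A64 bitmask immediate: the encoder itself reports whether the single-instruction form is possible, and the JIT falls back to the temporary register otherwise.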