summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt9
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic-its.txt38
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic-v3.txt206
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic.txt52
-rw-r--r--Documentation/virtual/kvm/devices/vcpu.txt4
-rw-r--r--arch/arm/include/asm/arch_gicv3.h93
-rw-r--r--arch/arm/include/asm/cp15.h15
-rw-r--r--arch/arm/include/asm/cputype.h1
-rw-r--r--arch/arm/include/asm/kvm_asm.h7
-rw-r--r--arch/arm/include/asm/kvm_emulate.h35
-rw-r--r--arch/arm/include/asm/kvm_host.h17
-rw-r--r--arch/arm/include/asm/kvm_hyp.h18
-rw-r--r--arch/arm/include/asm/kvm_mmu.h28
-rw-r--r--arch/arm/include/uapi/asm/kvm.h7
-rw-r--r--arch/arm/kvm/Makefile3
-rw-r--r--arch/arm/kvm/arm.c22
-rw-r--r--arch/arm/kvm/coproc.c35
-rw-r--r--arch/arm/kvm/emulate.c111
-rw-r--r--arch/arm/kvm/handle_exit.c49
-rw-r--r--arch/arm/kvm/hyp/Makefile1
-rw-r--r--arch/arm/kvm/hyp/entry.S31
-rw-r--r--arch/arm/kvm/hyp/hyp-entry.S16
-rw-r--r--arch/arm/kvm/hyp/switch.c25
-rw-r--r--arch/arm/kvm/hyp/tlb.c15
-rw-r--r--arch/arm/kvm/mmio.c6
-rw-r--r--arch/arm/kvm/mmu.c7
-rw-r--r--arch/arm64/include/asm/arch_gicv3.h13
-rw-r--r--arch/arm64/include/asm/kvm_arm.h4
-rw-r--r--arch/arm64/include/asm/kvm_asm.h9
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h11
-rw-r--r--arch/arm64/include/asm/kvm_host.h12
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h1
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h6
-rw-r--r--arch/arm64/kvm/Kconfig4
-rw-r--r--arch/arm64/kvm/Makefile3
-rw-r--r--arch/arm64/kvm/handle_exit.c23
-rw-r--r--arch/arm64/kvm/hyp/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/debug-sr.c4
-rw-r--r--arch/arm64/kvm/hyp/entry.S128
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S73
-rw-r--r--arch/arm64/kvm/hyp/switch.c84
-rw-r--r--arch/arm64/kvm/hyp/tlb.c13
-rw-r--r--arch/arm64/kvm/inject_fault.c12
-rw-r--r--arch/mips/include/asm/kvm_host.h63
-rw-r--r--arch/mips/kvm/emulate.c78
-rw-r--r--arch/mips/kvm/mips.c40
-rw-r--r--arch/mips/kvm/mmu.c16
-rw-r--r--arch/mips/kvm/trap_emul.c18
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h37
-rw-r--r--arch/powerpc/include/asm/io.h29
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h10
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h39
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h90
-rw-r--r--arch/powerpc/include/asm/kvm_host.h124
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h28
-rw-r--r--arch/powerpc/include/asm/mmu.h1
-rw-r--r--arch/powerpc/include/asm/opal.h1
-rw-r--r--arch/powerpc/include/asm/pnv-pci.h3
-rw-r--r--arch/powerpc/include/asm/reg.h1
-rw-r--r--arch/powerpc/kernel/asm-offsets.c2
-rw-r--r--arch/powerpc/kvm/Kconfig3
-rw-r--r--arch/powerpc/kvm/Makefile19
-rw-r--r--arch/powerpc/kvm/book3s.c13
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c4
-rw-r--r--arch/powerpc/kvm/book3s_hv.c533
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c156
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c120
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S197
-rw-r--r--arch/powerpc/kvm/book3s_pr.c10
-rw-r--r--arch/powerpc/kvm/book3s_xics.c57
-rw-r--r--arch/powerpc/kvm/book3s_xics.h2
-rw-r--r--arch/powerpc/kvm/booke.c2
-rw-r--r--arch/powerpc/kvm/e500_mmu.c73
-rw-r--r--arch/powerpc/kvm/powerpc.c61
-rw-r--r--arch/powerpc/kvm/trace_hv.h22
-rw-r--r--arch/powerpc/mm/hash_native_64.c42
-rw-r--r--arch/powerpc/mm/hash_utils_64.c55
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S1
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c24
-rw-r--r--arch/s390/include/asm/kvm_host.h136
-rw-r--r--arch/s390/kernel/asm-offsets.c1
-rw-r--r--arch/s390/kvm/gaccess.c37
-rw-r--r--arch/s390/kvm/guestdbg.c59
-rw-r--r--arch/s390/kvm/intercept.c1
-rw-r--r--arch/s390/kvm/interrupt.c98
-rw-r--r--arch/s390/kvm/kvm-s390.c75
-rw-r--r--arch/s390/kvm/kvm-s390.h14
-rw-r--r--arch/s390/kvm/priv.c21
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c2
-rw-r--r--arch/x86/include/asm/kvm_host.h78
-rw-r--r--arch/x86/include/asm/pvclock.h5
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/debugfs.c69
-rw-r--r--arch/x86/kvm/hyperv.c157
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/lapic.c5
-rw-r--r--arch/x86/kvm/mmu.c12
-rw-r--r--arch/x86/kvm/svm.c417
-rw-r--r--arch/x86/kvm/vmx.c207
-rw-r--r--arch/x86/kvm/x86.c171
-rw-r--r--arch/x86/kvm/x86.h6
-rw-r--r--drivers/iommu/amd_iommu.c484
-rw-r--r--drivers/iommu/amd_iommu_init.c181
-rw-r--r--drivers/iommu/amd_iommu_proto.h1
-rw-r--r--drivers/iommu/amd_iommu_types.h149
-rw-r--r--include/kvm/arm_vgic.h18
-rw-r--r--include/linux/amd-iommu.h43
-rw-r--r--include/linux/kvm_host.h4
-rw-r--r--kernel/configs/kvm_guest.config (renamed from arch/x86/configs/kvm_guest.config)1
-rw-r--r--virt/kvm/arm/aarch32.c (renamed from arch/arm64/kvm/emulate.c)25
-rw-r--r--virt/kvm/arm/arch_timer.c17
-rw-r--r--virt/kvm/arm/hyp/vgic-v2-sr.c57
-rw-r--r--virt/kvm/arm/hyp/vgic-v3-sr.c (renamed from arch/arm64/kvm/hyp/vgic-v3-sr.c)17
-rw-r--r--virt/kvm/arm/pmu.c8
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c4
-rw-r--r--virt/kvm/arm/vgic/vgic-irqfd.c6
-rw-r--r--virt/kvm/arm/vgic/vgic-kvm-device.c133
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v3.c8
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.c2
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.h4
-rw-r--r--virt/kvm/arm/vgic/vgic-v2.c71
-rw-r--r--virt/kvm/arm/vgic/vgic.c8
-rw-r--r--virt/kvm/arm/vgic/vgic.h54
-rw-r--r--virt/kvm/eventfd.c22
-rw-r--r--virt/kvm/kvm_main.c50
127 files changed, 4540 insertions, 1635 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6fa1d8ab973c..ec8d81417dc8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -460,6 +460,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
driver will print ACPI tables for AMD IOMMU during
IOMMU initialization.
+ amd_iommu_intr= [HW,X86-64]
+ Specifies one of the following AMD IOMMU interrupt
+ remapping modes:
+ legacy - Use legacy interrupt remapping mode.
+ vapic - Use virtual APIC mode, which allows IOMMU
+ to inject interrupts directly into guest.
+ This mode requires kvm-amd.avic=1.
+ (Default when IOMMU HW support is present.)
+
amijoy.map= [HW,JOY] Amiga joystick support
Map of devices attached to JOY0DAT and JOY1DAT
Format: <a>,<b>
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
new file mode 100644
index 000000000000..6081a5b7fc1e
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
@@ -0,0 +1,38 @@
+ARM Virtual Interrupt Translation Service (ITS)
+===============================================
+
+Device types supported:
+ KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller
+
+The ITS allows MSI(-X) interrupts to be injected into guests. This extension is
+optional. Creating a virtual ITS controller also requires a host GICv3 (see
+arm-vgic-v3.txt), but does not depend on having physical ITS controllers.
+
+There can be multiple ITS controllers per guest, each of them has to have
+a separate, non-overlapping MMIO region.
+
+
+Groups:
+ KVM_DEV_ARM_VGIC_GRP_ADDR
+ Attributes:
+ KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit)
+ Base address in the guest physical address space of the GICv3 ITS
+ control register frame.
+ This address needs to be 64K aligned and the region covers 128K.
+ Errors:
+ -E2BIG: Address outside of addressable IPA range
+ -EINVAL: Incorrectly aligned address
+ -EEXIST: Address already configured
+ -EFAULT: Invalid user pointer for attr->addr.
+ -ENODEV: Incorrect attribute or the ITS is not supported.
+
+
+ KVM_DEV_ARM_VGIC_GRP_CTRL
+ Attributes:
+ KVM_DEV_ARM_VGIC_CTRL_INIT
+ request the initialization of the ITS, no additional parameter in
+ kvm_device_attr.addr.
+ Errors:
+ -ENXIO: ITS not properly configured as required prior to setting
+ this attribute
+ -ENOMEM: Memory shortage when allocating ITS internal data
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
new file mode 100644
index 000000000000..9348b3caccd7
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
@@ -0,0 +1,206 @@
+ARM Virtual Generic Interrupt Controller v3 and later (VGICv3)
+==============================================================
+
+
+Device types supported:
+ KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0
+
+Only one VGIC instance may be instantiated through this API. The created VGIC
+will act as the VM interrupt controller, requiring emulated user-space devices
+to inject interrupts to the VGIC instead of directly to CPUs. It is not
+possible to create both a GICv3 and GICv2 on the same VM.
+
+Creating a guest GICv3 device requires a host GICv3 as well.
+
+
+Groups:
+ KVM_DEV_ARM_VGIC_GRP_ADDR
+ Attributes:
+ KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit)
+ Base address in the guest physical address space of the GICv3 distributor
+ register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+ This address needs to be 64K aligned and the region covers 64 KByte.
+
+ KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit)
+ Base address in the guest physical address space of the GICv3
+ redistributor register mappings. There are two 64K pages for each
+ VCPU and all of the redistributor pages are contiguous.
+ Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+ This address needs to be 64K aligned.
+ Errors:
+ -E2BIG: Address outside of addressable IPA range
+ -EINVAL: Incorrectly aligned address
+ -EEXIST: Address already configured
+ -ENXIO: The group or attribute is unknown/unsupported for this device
+ or hardware support is missing.
+ -EFAULT: Invalid user pointer for attr->addr.
+
+
+
+ KVM_DEV_ARM_VGIC_GRP_DIST_REGS
+ KVM_DEV_ARM_VGIC_GRP_REDIST_REGS
+ Attributes:
+ The attr field of kvm_device_attr encodes two values:
+ bits: | 63 .... 32 | 31 .... 0 |
+ values: | mpidr | offset |
+
+ All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a
+ __u32 value. 64-bit registers must be accessed by separately accessing the
+ lower and higher word.
+
+ Writes to read-only registers are ignored by the kernel.
+
+ KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers.
+ KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU
+ specified by the mpidr.
+
+ The offset is relative to the "[Re]Distributor base address" as defined
+ in the GICv3/4 specs. Getting or setting such a register has the same
+ effect as reading or writing the register on real hardware, except for the
+ following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR,
+ GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. These registers behave
+ differently when accessed via this interface compared to their
+ architecturally defined behavior to allow software a full view of the
+ VGIC's internal state.
+
+ The mpidr field is used to specify which
+ redistributor is accessed. The mpidr is ignored for the distributor.
+
+ The mpidr encoding is based on the affinity information in the
+ architecture defined MPIDR, and the field is encoded as follows:
+ | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+ | Aff3 | Aff2 | Aff1 | Aff0 |
+
+ Note that distributor fields are not banked, but return the same value
+ regardless of the mpidr used to access the register.
+
+ The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such
+ that a write of a clear bit has no effect, whereas a write with a set bit
+ clears that value. To allow userspace to freely set the values of these two
+ registers, setting the attributes with the register offsets for these two
+ registers simply sets the non-reserved bits to the value written.
+
+
+ Accesses (reads and writes) to the GICD_ISPENDR register region and
+ GICR_ISPENDR0 registers get/set the value of the latched pending state for
+ the interrupts.
+
+ This is identical to the value returned by a guest read from ISPENDR for an
+ edge triggered interrupt, but may differ for level triggered interrupts.
+ For edge triggered interrupts, once an interrupt becomes pending (whether
+ because of an edge detected on the input line or because of a guest write
+ to ISPENDR) this state is "latched", and only cleared when either the
+ interrupt is activated or when the guest writes to ICPENDR. A level
+ triggered interrupt may be pending either because the level input is held
+ high by a device, or because of a guest write to the ISPENDR register. Only
+ ISPENDR writes are latched; if the device lowers the line level then the
+ interrupt is no longer pending unless the guest also wrote to ISPENDR, and
+ conversely writes to ICPENDR or activations of the interrupt do not clear
+ the pending status if the line level is still being held high. (These
+ rules are documented in the GICv3 specification descriptions of the ICPENDR
+ and ISPENDR registers.) For a level triggered interrupt the value accessed
+ here is that of the latch which is set by ISPENDR and cleared by ICPENDR or
+ interrupt activation, whereas the value returned by a guest read from
+ ISPENDR is the logical OR of the latch value and the input line level.
+
+ Raw access to the latch state is provided to userspace so that it can save
+ and restore the entire GIC internal state (which is defined by the
+ combination of the current input line level and the latch state, and cannot
+ be deduced from purely the line level and the value of the ISPENDR
+ registers).
+
+ Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have
+ RAZ/WI semantics, meaning that reads always return 0 and writes are always
+ ignored.
+
+ Errors:
+ -ENXIO: Getting or setting this register is not yet supported
+ -EBUSY: One or more VCPUs are running
+
+
+ KVM_DEV_ARM_VGIC_CPU_SYSREGS
+ Attributes:
+ The attr field of kvm_device_attr encodes two values:
+ bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 |
+ values: | mpidr | RES | instr |
+
+ The mpidr field encodes the CPU ID based on the affinity information in the
+ architecture defined MPIDR, and the field is encoded as follows:
+ | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+ | Aff3 | Aff2 | Aff1 | Aff0 |
+
+ The instr field encodes the system register to access based on the fields
+ defined in the A64 instruction set encoding for system register access
+ (RES means the bits are reserved for future use and should be zero):
+
+ | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 |
+ | Op 0 | Op1 | CRn | CRm | Op2 |
+
+ All system regs accessed through this API are (rw, 64-bit) and
+ kvm_device_attr.addr points to a __u64 value.
+
+ KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the
+ CPU specified by the mpidr field.
+
+ Errors:
+ -ENXIO: Getting or setting this register is not yet supported
+ -EBUSY: VCPU is running
+ -EINVAL: Invalid mpidr supplied
+
+
+ KVM_DEV_ARM_VGIC_GRP_NR_IRQS
+ Attributes:
+ A value describing the number of interrupts (SGI, PPI and SPI) for
+ this GIC instance, ranging from 64 to 1024, in increments of 32.
+
+ kvm_device_attr.addr points to a __u32 value.
+
+ Errors:
+ -EINVAL: Value set is out of the expected range
+ -EBUSY: Value has already be set.
+
+
+ KVM_DEV_ARM_VGIC_GRP_CTRL
+ Attributes:
+ KVM_DEV_ARM_VGIC_CTRL_INIT
+ request the initialization of the VGIC, no additional parameter in
+ kvm_device_attr.addr.
+ Errors:
+ -ENXIO: VGIC not properly configured as required prior to calling
+ this attribute
+ -ENODEV: no online VCPU
+ -ENOMEM: memory shortage when allocating vgic internal data
+
+
+ KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO
+ Attributes:
+ The attr field of kvm_device_attr encodes the following values:
+ bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 |
+ values: | mpidr | info | vINTID |
+
+ The vINTID specifies which set of IRQs is reported on.
+
+ The info field specifies which information userspace wants to get or set
+ using this interface. Currently we support the following info values:
+
+ VGIC_LEVEL_INFO_LINE_LEVEL:
+ Get/Set the input level of the IRQ line for a set of 32 contiguously
+ numbered interrupts.
+ vINTID must be a multiple of 32.
+
+ kvm_device_attr.addr points to a __u32 value which will contain a
+ bitmap where a set bit means the interrupt level is asserted.
+
+ Bit[n] indicates the status for interrupt vINTID + n.
+
+ SGIs and any interrupt with a higher ID than the number of interrupts
+ supported, will be RAZ/WI. LPIs are always edge-triggered and are
+ therefore not supported by this interface.
+
+ PPIs are reported per VCPU as specified in the mpidr field, and SPIs are
+ reported with the same value regardless of the mpidr specified.
+
+ The mpidr field encodes the CPU ID based on the affinity information in the
+ architecture defined MPIDR, and the field is encoded as follows:
+ | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+ | Aff3 | Aff2 | Aff1 | Aff0 |
diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 89182f80cc7f..76e61c883347 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -1,24 +1,19 @@
-ARM Virtual Generic Interrupt Controller (VGIC)
-===============================================
+ARM Virtual Generic Interrupt Controller v2 (VGIC)
+==================================================
Device types supported:
KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0
- KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0
- KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller
-Only one VGIC instance of the V2/V3 types above may be instantiated through
-either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will
-act as the VM interrupt controller, requiring emulated user-space devices to
-inject interrupts to the VGIC instead of directly to CPUs.
+Only one VGIC instance may be instantiated through either this API or the
+legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt
+controller, requiring emulated user-space devices to inject interrupts to the
+VGIC instead of directly to CPUs.
-Creating a guest GICv3 device requires a host GICv3 as well.
-GICv3 implementations with hardware compatibility support allow a guest GICv2
-as well.
+GICv3 implementations with hardware compatibility support allow creating a
+guest GICv2 through this interface. For information on creating a guest GICv3
+device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to
+create both a GICv3 and GICv2 device on the same VM.
-Creating a virtual ITS controller requires a host GICv3 (but does not depend
-on having physical ITS controllers).
-There can be multiple ITS controllers per guest, each of them has to have
-a separate, non-overlapping MMIO region.
Groups:
KVM_DEV_ARM_VGIC_GRP_ADDR
@@ -32,26 +27,13 @@ Groups:
Base address in the guest physical address space of the GIC virtual cpu
interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2.
This address needs to be 4K aligned and the region covers 4 KByte.
-
- KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit)
- Base address in the guest physical address space of the GICv3 distributor
- register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
- This address needs to be 64K aligned and the region covers 64 KByte.
-
- KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit)
- Base address in the guest physical address space of the GICv3
- redistributor register mappings. There are two 64K pages for each
- VCPU and all of the redistributor pages are contiguous.
- Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
- This address needs to be 64K aligned.
-
- KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
- Base address in the guest physical address space of the GICv3 ITS
- control register frame. The ITS allows MSI(-X) interrupts to be
- injected into guests. This extension is optional. If the kernel
- does not support the ITS, the call returns -ENODEV.
- Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
- This address needs to be 64K aligned and the region covers 128K.
+ Errors:
+ -E2BIG: Address outside of addressable IPA range
+ -EINVAL: Incorrectly aligned address
+ -EEXIST: Address already configured
+ -ENXIO: The group or attribute is unknown/unsupported for this device
+ or hardware support is missing.
+ -EFAULT: Invalid user pointer for attr->addr.
KVM_DEV_ARM_VGIC_GRP_DIST_REGS
Attributes:
diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virtual/kvm/devices/vcpu.txt
index c04165868faf..02f50686c418 100644
--- a/Documentation/virtual/kvm/devices/vcpu.txt
+++ b/Documentation/virtual/kvm/devices/vcpu.txt
@@ -30,4 +30,6 @@ Returns: -ENODEV: PMUv3 not supported
attribute
-EBUSY: PMUv3 already initialized
-Request the initialization of the PMUv3.
+Request the initialization of the PMUv3. This must be done after creating the
+in-kernel irqchip. Creating a PMU with a userspace irqchip is currently not
+supported.
diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h
index dfe4002812da..a8088290b778 100644
--- a/arch/arm/include/asm/arch_gicv3.h
+++ b/arch/arm/include/asm/arch_gicv3.h
@@ -22,9 +22,7 @@
#include <linux/io.h>
#include <asm/barrier.h>
-
-#define __ACCESS_CP15(CRn, Op1, CRm, Op2) p15, Op1, %0, CRn, CRm, Op2
-#define __ACCESS_CP15_64(Op1, CRm) p15, Op1, %Q0, %R0, CRm
+#include <asm/cp15.h>
#define ICC_EOIR1 __ACCESS_CP15(c12, 0, c12, 1)
#define ICC_DIR __ACCESS_CP15(c12, 0, c11, 1)
@@ -99,68 +97,129 @@
#define ICH_AP1R2 __AP1Rx(2)
#define ICH_AP1R3 __AP1Rx(3)
+/* A32-to-A64 mappings used by VGIC save/restore */
+
+#define CPUIF_MAP(a32, a64) \
+static inline void write_ ## a64(u32 val) \
+{ \
+ write_sysreg(val, a32); \
+} \
+static inline u32 read_ ## a64(void) \
+{ \
+ return read_sysreg(a32); \
+} \
+
+#define CPUIF_MAP_LO_HI(a32lo, a32hi, a64) \
+static inline void write_ ## a64(u64 val) \
+{ \
+ write_sysreg(lower_32_bits(val), a32lo);\
+ write_sysreg(upper_32_bits(val), a32hi);\
+} \
+static inline u64 read_ ## a64(void) \
+{ \
+ u64 val = read_sysreg(a32lo); \
+ \
+ val |= (u64)read_sysreg(a32hi) << 32; \
+ \
+ return val; \
+}
+
+CPUIF_MAP(ICH_HCR, ICH_HCR_EL2)
+CPUIF_MAP(ICH_VTR, ICH_VTR_EL2)
+CPUIF_MAP(ICH_MISR, ICH_MISR_EL2)
+CPUIF_MAP(ICH_EISR, ICH_EISR_EL2)
+CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2)
+CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2)
+CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2)
+CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2)
+CPUIF_MAP(ICH_AP0R1, ICH_AP0R1_EL2)
+CPUIF_MAP(ICH_AP0R0, ICH_AP0R0_EL2)
+CPUIF_MAP(ICH_AP1R3, ICH_AP1R3_EL2)
+CPUIF_MAP(ICH_AP1R2, ICH_AP1R2_EL2)
+CPUIF_MAP(ICH_AP1R1, ICH_AP1R1_EL2)
+CPUIF_MAP(ICH_AP1R0, ICH_AP1R0_EL2)
+CPUIF_MAP(ICC_HSRE, ICC_SRE_EL2)
+CPUIF_MAP(ICC_SRE, ICC_SRE_EL1)
+
+CPUIF_MAP_LO_HI(ICH_LR15, ICH_LRC15, ICH_LR15_EL2)
+CPUIF_MAP_LO_HI(ICH_LR14, ICH_LRC14, ICH_LR14_EL2)
+CPUIF_MAP_LO_HI(ICH_LR13, ICH_LRC13, ICH_LR13_EL2)
+CPUIF_MAP_LO_HI(ICH_LR12, ICH_LRC12, ICH_LR12_EL2)
+CPUIF_MAP_LO_HI(ICH_LR11, ICH_LRC11, ICH_LR11_EL2)
+CPUIF_MAP_LO_HI(ICH_LR10, ICH_LRC10, ICH_LR10_EL2)
+CPUIF_MAP_LO_HI(ICH_LR9, ICH_LRC9, ICH_LR9_EL2)
+CPUIF_MAP_LO_HI(ICH_LR8, ICH_LRC8, ICH_LR8_EL2)
+CPUIF_MAP_LO_HI(ICH_LR7, ICH_LRC7, ICH_LR7_EL2)
+CPUIF_MAP_LO_HI(ICH_LR6, ICH_LRC6, ICH_LR6_EL2)
+CPUIF_MAP_LO_HI(ICH_LR5, ICH_LRC5, ICH_LR5_EL2)
+CPUIF_MAP_LO_HI(ICH_LR4, ICH_LRC4, ICH_LR4_EL2)
+CPUIF_MAP_LO_HI(ICH_LR3, ICH_LRC3, ICH_LR3_EL2)
+CPUIF_MAP_LO_HI(ICH_LR2, ICH_LRC2, ICH_LR2_EL2)
+CPUIF_MAP_LO_HI(ICH_LR1, ICH_LRC1, ICH_LR1_EL2)
+CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2)
+
+#define read_gicreg(r) read_##r()
+#define write_gicreg(v, r) write_##r(v)
+
/* Low-level accessors */
static inline void gic_write_eoir(u32 irq)
{
- asm volatile("mcr " __stringify(ICC_EOIR1) : : "r" (irq));
+ write_sysreg(irq, ICC_EOIR1);
isb();
}
static inline void gic_write_dir(u32 val)
{
- asm volatile("mcr " __stringify(ICC_DIR) : : "r" (val));
+ write_sysreg(val, ICC_DIR);
isb();
}
static inline u32 gic_read_iar(void)
{
- u32 irqstat;
+ u32 irqstat = read_sysreg(ICC_IAR1);
- asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat));
dsb(sy);
+
return irqstat;
}
static inline void gic_write_pmr(u32 val)
{
- asm volatile("mcr " __stringify(ICC_PMR) : : "r" (val));
+ write_sysreg(val, ICC_PMR);
}
static inline void gic_write_ctlr(u32 val)
{
- asm volatile("mcr " __stringify(ICC_CTLR) : : "r" (val));
+ write_sysreg(val, ICC_CTLR);
isb();
}
static inline void gic_write_grpen1(u32 val)
{
- asm volatile("mcr " __stringify(ICC_IGRPEN1) : : "r" (val));
+ write_sysreg(val, ICC_IGRPEN1);
isb();
}
static inline void gic_write_sgi1r(u64 val)
{
- asm volatile("mcrr " __stringify(ICC_SGI1R) : : "r" (val));
+ write_sysreg(val, ICC_SGI1R);
}
static inline u32 gic_read_sre(void)
{
- u32 val;
-
- asm volatile("mrc " __stringify(ICC_SRE) : "=r" (val));
- return val;
+ return read_sysreg(ICC_SRE);
}
static inline void gic_write_sre(u32 val)
{
- asm volatile("mcr " __stringify(ICC_SRE) : : "r" (val));
+ write_sysreg(val, ICC_SRE);
isb();
}
static inline void gic_write_bpr1(u32 val)
{
- asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val));
+ write_sysreg(val, ICC_BPR1);
}
/*
diff --git a/arch/arm/include/asm/cp15.h b/arch/arm/include/asm/cp15.h
index c3f11524f10c..dbdbce1b3a72 100644
--- a/arch/arm/include/asm/cp15.h
+++ b/arch/arm/include/asm/cp15.h
@@ -49,6 +49,21 @@
#ifdef CONFIG_CPU_CP15
+#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \
+ "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
+#define __ACCESS_CP15_64(Op1, CRm) \
+ "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64
+
+#define __read_sysreg(r, w, c, t) ({ \
+ t __val; \
+ asm volatile(r " " c : "=r" (__val)); \
+ __val; \
+})
+#define read_sysreg(...) __read_sysreg(__VA_ARGS__)
+
+#define __write_sysreg(v, r, w, c, t) asm volatile(w " " c : : "r" ((t)(v)))
+#define write_sysreg(v, ...) __write_sysreg(v, __VA_ARGS__)
+
extern unsigned long cr_alignment; /* defined in entry-armv.S */
static inline unsigned long get_cr(void)
diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
index 754f86f667d4..522b5feb4eaa 100644
--- a/arch/arm/include/asm/cputype.h
+++ b/arch/arm/include/asm/cputype.h
@@ -55,6 +55,7 @@
#define MPIDR_LEVEL_BITS 8
#define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1)
+#define MPIDR_LEVEL_SHIFT(level) (MPIDR_LEVEL_BITS * level)
#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 58faff5f1eb2..d7ea6bcb29bf 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -21,6 +21,10 @@
#include <asm/virt.h>
+#define ARM_EXIT_WITH_ABORT_BIT 31
+#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT))
+#define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT))
+
#define ARM_EXCEPTION_RESET 0
#define ARM_EXCEPTION_UNDEFINED 1
#define ARM_EXCEPTION_SOFTWARE 2
@@ -68,6 +72,9 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
extern void __init_stage2_translation(void);
extern void __kvm_hyp_reset(unsigned long);
+
+extern u64 __vgic_v3_get_ich_vtr_el2(void);
+extern void __vgic_v3_init_lrs(void);
#endif
#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index ee5328fc4b06..9a8a45aaf19a 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -40,18 +40,29 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
*vcpu_reg(vcpu, reg_num) = val;
}
-bool kvm_condition_valid(struct kvm_vcpu *vcpu);
-void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr);
+bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
+void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_vabt(struct kvm_vcpu *vcpu);
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
+static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu)
+{
+ return kvm_condition_valid32(vcpu);
+}
+
+static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+ kvm_skip_instr32(vcpu, is_wide_instr);
+}
+
static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr = HCR_GUEST_MASK;
}
-static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+static inline unsigned long vcpu_get_hcr(const struct kvm_vcpu *vcpu)
{
return vcpu->arch.hcr;
}
@@ -61,7 +72,7 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
vcpu->arch.hcr = hcr;
}
-static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
+static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu)
{
return 1;
}
@@ -71,9 +82,9 @@ static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu)
return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_pc;
}
-static inline unsigned long *vcpu_cpsr(struct kvm_vcpu *vcpu)
+static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu)
{
- return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr;
+ return (unsigned long *)&vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr;
}
static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
@@ -93,11 +104,21 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu)
return cpsr_mode > USR_MODE;;
}
-static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu)
+static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)
{
return vcpu->arch.fault.hsr;
}
+static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
+{
+ u32 hsr = kvm_vcpu_get_hsr(vcpu);
+
+ if (hsr & HSR_CV)
+ return (hsr & HSR_COND) >> HSR_COND_SHIFT;
+
+ return -1;
+}
+
static inline unsigned long kvm_vcpu_get_hfar(struct kvm_vcpu *vcpu)
{
return vcpu->arch.fault.hxfar;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index de338d93d11b..2d19e02d03fd 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -39,7 +39,12 @@
#include <kvm/arm_vgic.h>
+
+#ifdef CONFIG_ARM_GIC_V3
+#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
+#else
#define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
+#endif
#define KVM_REQ_VCPU_EXIT 8
@@ -183,15 +188,15 @@ struct kvm_vcpu_arch {
};
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 hvc_exit_stat;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
u64 mmio_exit_user;
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index 6eaff28f2ff3..343135ede5fa 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -20,28 +20,15 @@
#include <linux/compiler.h>
#include <linux/kvm_host.h>
+#include <asm/cp15.h>
#include <asm/kvm_mmu.h>
#include <asm/vfp.h>
#define __hyp_text __section(.hyp.text) notrace
-#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \
- "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
-#define __ACCESS_CP15_64(Op1, CRm) \
- "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64
#define __ACCESS_VFP(CRn) \
"mrc", "mcr", __stringify(p10, 7, %0, CRn, cr0, 0), u32
-#define __write_sysreg(v, r, w, c, t) asm volatile(w " " c : : "r" ((t)(v)))
-#define write_sysreg(v, ...) __write_sysreg(v, __VA_ARGS__)
-
-#define __read_sysreg(r, w, c, t) ({ \
- t __val; \
- asm volatile(r " " c : "=r" (__val)); \
- __val; \
-})
-#define read_sysreg(...) __read_sysreg(__VA_ARGS__)
-
#define write_special(v, r) \
asm volatile("msr " __stringify(r) ", %0" : : "r" (v))
#define read_special(r) ({ \
@@ -119,6 +106,9 @@ void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
void __sysreg_save_state(struct kvm_cpu_context *ctxt);
void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
+void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+
void asmlinkage __vfp_save_state(struct vfp_hard_struct *vfp);
void asmlinkage __vfp_restore_state(struct vfp_hard_struct *vfp);
static inline bool __vfp_enabled(void)
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 3bb803d6814b..74a44727f8e1 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -63,37 +63,13 @@ void kvm_clear_hyp_idmap(void);
static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
{
*pmd = new_pmd;
- flush_pmd_entry(pmd);
+ dsb(ishst);
}
static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
{
*pte = new_pte;
- /*
- * flush_pmd_entry just takes a void pointer and cleans the necessary
- * cache entries, so we can reuse the function for ptes.
- */
- flush_pmd_entry(pte);
-}
-
-static inline void kvm_clean_pgd(pgd_t *pgd)
-{
- clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t));
-}
-
-static inline void kvm_clean_pmd(pmd_t *pmd)
-{
- clean_dcache_area(pmd, PTRS_PER_PMD * sizeof(pmd_t));
-}
-
-static inline void kvm_clean_pmd_entry(pmd_t *pmd)
-{
- clean_pmd_entry(pmd);
-}
-
-static inline void kvm_clean_pte(pte_t *pte)
-{
- clean_pte_table(pte);
+ dsb(ishst);
}
static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index a2b3eb313a25..b38c10c73579 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -84,6 +84,13 @@ struct kvm_regs {
#define KVM_VGIC_V2_DIST_SIZE 0x1000
#define KVM_VGIC_V2_CPU_SIZE 0x2000
+/* Supported VGICv3 address types */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST 2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3
+
+#define KVM_VGIC_V3_DIST_SIZE SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K)
+
#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
#define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 10d77a66cad5..f19842ea5418 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -21,13 +21,16 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
obj-y += kvm-arm.o init.o interrupts.o
obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
+obj-y += $(KVM)/arm/aarch32.o
obj-y += $(KVM)/arm/vgic/vgic.o
obj-y += $(KVM)/arm/vgic/vgic-init.o
obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
obj-y += $(KVM)/arm/vgic/vgic-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-v3.o
obj-y += $(KVM)/arm/vgic/vgic-mmio.o
obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o
obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
obj-y += $(KVM)/irqchip.o
obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index c94b90d43772..03e9273f1876 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -144,6 +144,16 @@ out_fail_alloc:
return ret;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
@@ -1176,6 +1186,10 @@ static int init_common_resources(void)
return -ENOMEM;
}
+ /* set size of VMID supported by CPU */
+ kvm_vmid_bits = kvm_get_vmid_bits();
+ kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+
return 0;
}
@@ -1241,10 +1255,6 @@ static void teardown_hyp_mode(void)
static int init_vhe_mode(void)
{
- /* set size of VMID supported by CPU */
- kvm_vmid_bits = kvm_get_vmid_bits();
- kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
kvm_info("VHE mode initialized successfully\n");
return 0;
}
@@ -1328,10 +1338,6 @@ static int init_hyp_mode(void)
}
}
- /* set size of VMID supported by CPU */
- kvm_vmid_bits = kvm_get_vmid_bits();
- kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
kvm_info("Hyp mode initialized successfully\n");
return 0;
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 1bb2b79c01ff..3e5e4194ef86 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -228,6 +228,35 @@ bool access_vm_reg(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_gic_sgi(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
+{
+ u64 reg;
+
+ if (!p->is_write)
+ return read_from_write_only(vcpu, p);
+
+ reg = (u64)*vcpu_reg(vcpu, p->Rt2) << 32;
+ reg |= *vcpu_reg(vcpu, p->Rt1) ;
+
+ vgic_v3_dispatch_sgi(vcpu, reg);
+
+ return true;
+}
+
+static bool access_gic_sre(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
+{
+ if (p->is_write)
+ return ignore_write(vcpu, p);
+
+ *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+
+ return true;
+}
+
/*
* We could trap ID_DFR0 and tell the guest we don't support performance
* monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -361,10 +390,16 @@ static const struct coproc_reg cp15_regs[] = {
{ CRn(10), CRm( 3), Op1( 0), Op2( 1), is32,
access_vm_reg, reset_unknown, c10_AMAIR1},
+ /* ICC_SGI1R */
+ { CRm64(12), Op1( 0), is64, access_gic_sgi},
+
/* VBAR: swapped by interrupt.S. */
{ CRn(12), CRm( 0), Op1( 0), Op2( 0), is32,
NULL, reset_val, c12_VBAR, 0x00000000 },
+ /* ICC_SRE */
+ { CRn(12), CRm(12), Op1( 0), Op2(5), is32, access_gic_sre },
+
/* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */
{ CRn(13), CRm( 0), Op1( 0), Op2( 1), is32,
access_vm_reg, reset_val, c13_CID, 0x00000000 },
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index af93e3ffc9f3..0064b86a2c87 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -161,105 +161,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
}
}
-/*
- * A conditional instruction is allowed to trap, even though it
- * wouldn't be executed. So let's re-implement the hardware, in
- * software!
- */
-bool kvm_condition_valid(struct kvm_vcpu *vcpu)
-{
- unsigned long cpsr, cond, insn;
-
- /*
- * Exception Code 0 can only happen if we set HCR.TGE to 1, to
- * catch undefined instructions, and then we won't get past
- * the arm_exit_handlers test anyway.
- */
- BUG_ON(!kvm_vcpu_trap_get_class(vcpu));
-
- /* Top two bits non-zero? Unconditional. */
- if (kvm_vcpu_get_hsr(vcpu) >> 30)
- return true;
-
- cpsr = *vcpu_cpsr(vcpu);
-
- /* Is condition field valid? */
- if ((kvm_vcpu_get_hsr(vcpu) & HSR_CV) >> HSR_CV_SHIFT)
- cond = (kvm_vcpu_get_hsr(vcpu) & HSR_COND) >> HSR_COND_SHIFT;
- else {
- /* This can happen in Thumb mode: examine IT state. */
- unsigned long it;
-
- it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
- /* it == 0 => unconditional. */
- if (it == 0)
- return true;
-
- /* The cond for this insn works out as the top 4 bits. */
- cond = (it >> 4);
- }
-
- /* Shift makes it look like an ARM-mode instruction */
- insn = cond << 28;
- return arm_check_condition(insn, cpsr) != ARM_OPCODE_CONDTEST_FAIL;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu: The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
- unsigned long itbits, cond;
- unsigned long cpsr = *vcpu_cpsr(vcpu);
- bool is_arm = !(cpsr & PSR_T_BIT);
-
- BUG_ON(is_arm && (cpsr & PSR_IT_MASK));
-
- if (!(cpsr & PSR_IT_MASK))
- return;
-
- cond = (cpsr & 0xe000) >> 13;
- itbits = (cpsr & 0x1c00) >> (10 - 2);
- itbits |= (cpsr & (0x3 << 25)) >> 25;
-
- /* Perform ITAdvance (see page A-52 in ARM DDI 0406C) */
- if ((itbits & 0x7) == 0)
- itbits = cond = 0;
- else
- itbits = (itbits << 1) & 0x1f;
-
- cpsr &= ~PSR_IT_MASK;
- cpsr |= cond << 13;
- cpsr |= (itbits & 0x1c) << (10 - 2);
- cpsr |= (itbits & 0x3) << 25;
- *vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
- bool is_thumb;
-
- is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT);
- if (is_thumb && !is_wide_instr)
- *vcpu_pc(vcpu) += 2;
- else
- *vcpu_pc(vcpu) += 4;
- kvm_adjust_itstate(vcpu);
-}
-
-
/******************************************************************************
* Inject exceptions into the guest
*/
@@ -402,3 +303,15 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
inject_abt(vcpu, true, addr);
}
+
+/**
+ * kvm_inject_vabt - inject an async abort / SError into the guest
+ * @vcpu: The VCPU to receive the exception
+ *
+ * It is assumed that this code is called from the VCPU thread and that the
+ * VCPU therefore is not currently executing guest code.
+ */
+void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+{
+ vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VA);
+}
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 3f1ef0dbc899..4e40d1955e35 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -28,14 +28,6 @@
typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
-static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
- /* SVC called from Hyp mode should never get here */
- kvm_debug("SVC called from Hyp mode shouldn't go here\n");
- BUG();
- return -EINVAL; /* Squash warning */
-}
-
static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
int ret;
@@ -59,22 +51,6 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
return 1;
}
-static int handle_pabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
- /* The hypervisor should never cause aborts */
- kvm_err("Prefetch Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n",
- kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu));
- return -EFAULT;
-}
-
-static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
- /* This is either an error in the ws. code or an external abort */
- kvm_err("Data Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n",
- kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu));
- return -EFAULT;
-}
-
/**
* kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests
* @vcpu: the vcpu pointer
@@ -112,13 +88,10 @@ static exit_handle_fn arm_exit_handlers[] = {
[HSR_EC_CP14_64] = kvm_handle_cp14_access,
[HSR_EC_CP_0_13] = kvm_handle_cp_0_13_access,
[HSR_EC_CP10_ID] = kvm_handle_cp10_id,
- [HSR_EC_SVC_HYP] = handle_svc_hyp,
[HSR_EC_HVC] = handle_hvc,
[HSR_EC_SMC] = handle_smc,
[HSR_EC_IABT] = kvm_handle_guest_abort,
- [HSR_EC_IABT_HYP] = handle_pabt_hyp,
[HSR_EC_DABT] = kvm_handle_guest_abort,
- [HSR_EC_DABT_HYP] = handle_dabt_hyp,
};
static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
@@ -144,6 +117,25 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
{
exit_handle_fn exit_handler;
+ if (ARM_ABORT_PENDING(exception_index)) {
+ u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
+
+ /*
+ * HVC/SMC already have an adjusted PC, which we need
+ * to correct in order to return to after having
+ * injected the abort.
+ */
+ if (hsr_ec == HSR_EC_HVC || hsr_ec == HSR_EC_SMC) {
+ u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2;
+ *vcpu_pc(vcpu) -= adj;
+ }
+
+ kvm_inject_vabt(vcpu);
+ return 1;
+ }
+
+ exception_index = ARM_EXCEPTION_CODE(exception_index);
+
switch (exception_index) {
case ARM_EXCEPTION_IRQ:
return 1;
@@ -160,6 +152,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
exit_handler = kvm_get_exit_handler(vcpu);
return exit_handler(vcpu, run);
+ case ARM_EXCEPTION_DATA_ABORT:
+ kvm_inject_vabt(vcpu);
+ return 1;
default:
kvm_pr_unimpl("Unsupported exception type: %d",
exception_index);
diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile
index 8dfa5f7f9290..3023bb530edf 100644
--- a/arch/arm/kvm/hyp/Makefile
+++ b/arch/arm/kvm/hyp/Makefile
@@ -5,6 +5,7 @@
KVM=../../../../virt/kvm
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
diff --git a/arch/arm/kvm/hyp/entry.S b/arch/arm/kvm/hyp/entry.S
index 21c238871c9e..60783f3b57cc 100644
--- a/arch/arm/kvm/hyp/entry.S
+++ b/arch/arm/kvm/hyp/entry.S
@@ -18,6 +18,7 @@
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
.arch_extension virt
@@ -63,6 +64,36 @@ ENTRY(__guest_exit)
ldr lr, [r0, #4]
mov r0, r1
+ mrs r1, SPSR
+ mrs r2, ELR_hyp
+ mrc p15, 4, r3, c5, c2, 0 @ HSR
+
+ /*
+ * Force loads and stores to complete before unmasking aborts
+ * and forcing the delivery of the exception. This gives us a
+ * single instruction window, which the handler will try to
+ * match.
+ */
+ dsb sy
+ cpsie a
+
+ .global abort_guest_exit_start
+abort_guest_exit_start:
+
+ isb
+
+ .global abort_guest_exit_end
+abort_guest_exit_end:
+
+ /*
+ * If we took an abort, r0[31] will be set, and cmp will set
+ * the N bit in PSTATE.
+ */
+ cmp r0, #0
+ msrmi SPSR_cxsf, r1
+ msrmi ELR_hyp, r2
+ mcrmi p15, 4, r3, c5, c2, 0 @ HSR
+
bx lr
ENDPROC(__guest_exit)
diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S
index 78091383a5d9..96beb53934c9 100644
--- a/arch/arm/kvm/hyp/hyp-entry.S
+++ b/arch/arm/kvm/hyp/hyp-entry.S
@@ -81,7 +81,6 @@ __kvm_hyp_vector:
invalid_vector hyp_undef ARM_EXCEPTION_UNDEFINED
invalid_vector hyp_svc ARM_EXCEPTION_SOFTWARE
invalid_vector hyp_pabt ARM_EXCEPTION_PREF_ABORT
- invalid_vector hyp_dabt ARM_EXCEPTION_DATA_ABORT
invalid_vector hyp_fiq ARM_EXCEPTION_FIQ
ENTRY(__hyp_do_panic)
@@ -164,6 +163,21 @@ hyp_irq:
load_vcpu r0 @ Load VCPU pointer to r0
b __guest_exit
+hyp_dabt:
+ push {r0, r1}
+ mrs r0, ELR_hyp
+ ldr r1, =abort_guest_exit_start
+THUMB( add r1, r1, #1)
+ cmp r0, r1
+ ldrne r1, =abort_guest_exit_end
+THUMB( addne r1, r1, #1)
+ cmpne r0, r1
+ pop {r0, r1}
+ bne __hyp_panic
+
+ orr r0, r0, #(1 << ARM_EXIT_WITH_ABORT_BIT)
+ eret
+
.ltorg
.popsection
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index b13caa90cd44..92678b7bd046 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -14,6 +14,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <linux/jump_label.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_hyp.h>
@@ -54,6 +55,15 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
{
u32 val;
+ /*
+ * If we pended a virtual abort, preserve it until it gets
+ * cleared. See B1.9.9 (Virtual Abort exception) for details,
+ * but the crucial bit is the zeroing of HCR.VA in the
+ * pseudocode.
+ */
+ if (vcpu->arch.hcr & HCR_VA)
+ vcpu->arch.hcr = read_sysreg(HCR);
+
write_sysreg(0, HCR);
write_sysreg(0, HSTR);
val = read_sysreg(HDCR);
@@ -74,14 +84,21 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
write_sysreg(read_sysreg(MIDR), VPIDR);
}
+
static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
{
- __vgic_v2_save_state(vcpu);
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_save_state(vcpu);
+ else
+ __vgic_v2_save_state(vcpu);
}
static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
{
- __vgic_v2_restore_state(vcpu);
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_restore_state(vcpu);
+ else
+ __vgic_v2_restore_state(vcpu);
}
static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
@@ -134,7 +151,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
return true;
}
-static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_cpu_context *guest_ctxt;
@@ -191,8 +208,6 @@ again:
return exit_code;
}
-__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-
static const char * const __hyp_panic_string[] = {
[ARM_EXCEPTION_RESET] = "\nHYP panic: RST PC:%08x CPSR:%08x",
[ARM_EXCEPTION_UNDEFINED] = "\nHYP panic: UNDEF PC:%08x CPSR:%08x",
diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c
index a2636001e616..729652854f90 100644
--- a/arch/arm/kvm/hyp/tlb.c
+++ b/arch/arm/kvm/hyp/tlb.c
@@ -34,7 +34,7 @@
* As v7 does not support flushing per IPA, just nuke the whole TLB
* instead, ignoring the ipa value.
*/
-static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
{
dsb(ishst);
@@ -50,21 +50,14 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
write_sysreg(0, VTTBR);
}
-__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
-static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
- __tlb_flush_vmid(kvm);
+ __kvm_tlb_flush_vmid(kvm);
}
-__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
- phys_addr_t ipa);
-
-static void __hyp_text __tlb_flush_vm_context(void)
+void __hyp_text __kvm_flush_vm_context(void)
{
write_sysreg(0, TLBIALLNSNHIS);
write_sysreg(0, ICIALLUIS);
dsb(ish);
}
-
-__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
index 10f80a6c797a..b6e715fd3c90 100644
--- a/arch/arm/kvm/mmio.c
+++ b/arch/arm/kvm/mmio.c
@@ -126,12 +126,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
int access_size;
bool sign_extend;
- if (kvm_vcpu_dabt_isextabt(vcpu)) {
- /* cache operation on I/O addr, tell guest unsupported */
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- return 1;
- }
-
if (kvm_vcpu_dabt_iss1tw(vcpu)) {
/* page table accesses IO mem: tell guest to fix its TTBR */
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index e9a5c0e0c115..a5265edbeeab 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -744,7 +744,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
if (!pgd)
return -ENOMEM;
- kvm_clean_pgd(pgd);
kvm->arch.pgd = pgd;
return 0;
}
@@ -936,7 +935,6 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
if (!cache)
return 0; /* ignore calls from kvm_set_spte_hva */
pte = mmu_memory_cache_alloc(cache);
- kvm_clean_pte(pte);
pmd_populate_kernel(NULL, pmd, pte);
get_page(virt_to_page(pmd));
}
@@ -1434,6 +1432,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
int ret, idx;
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
+ if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) {
+ kvm_inject_vabt(vcpu);
+ return 1;
+ }
+
fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index fc2a0cb47b2c..f8ae6d6e4767 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -80,6 +80,19 @@
#include <linux/stringify.h>
#include <asm/barrier.h>
+#define read_gicreg(r) \
+ ({ \
+ u64 reg; \
+ asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
+ reg; \
+ })
+
+#define write_gicreg(v,r) \
+ do { \
+ u64 __val = (v); \
+ asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
+ } while (0)
+
/*
* Low-level accessors
*
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 4b5c977af465..2a2752b5b6aa 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -50,7 +50,7 @@
#define HCR_BSU (3 << 10)
#define HCR_BSU_IS (UL(1) << 10)
#define HCR_FB (UL(1) << 9)
-#define HCR_VA (UL(1) << 8)
+#define HCR_VSE (UL(1) << 8)
#define HCR_VI (UL(1) << 7)
#define HCR_VF (UL(1) << 6)
#define HCR_AMO (UL(1) << 5)
@@ -80,7 +80,7 @@
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW)
-#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
+#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
#define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 7561f63f1c28..18f746551bf6 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -20,10 +20,15 @@
#include <asm/virt.h>
+#define ARM_EXIT_WITH_SERROR_BIT 31
+#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
+#define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT))
+
#define ARM_EXCEPTION_IRQ 0
-#define ARM_EXCEPTION_TRAP 1
+#define ARM_EXCEPTION_EL1_SERROR 1
+#define ARM_EXCEPTION_TRAP 2
/* The hyp-stub will return this for any kvm_call_hyp() call */
-#define ARM_EXCEPTION_HYP_GONE 2
+#define ARM_EXCEPTION_HYP_GONE 3
#define KVM_ARM64_DEBUG_DIRTY_SHIFT 0
#define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 4cdeae3b17c6..fd9d5fd788f5 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -38,6 +38,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_vabt(struct kvm_vcpu *vcpu);
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
@@ -147,6 +148,16 @@ static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)
return vcpu->arch.fault.esr_el2;
}
+static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
+{
+ u32 esr = kvm_vcpu_get_hsr(vcpu);
+
+ if (esr & ESR_ELx_CV)
+ return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
+
+ return -1;
+}
+
static inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu)
{
return vcpu->arch.fault.far_el2;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3eda975837d0..bd94e6766759 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -290,15 +290,15 @@ struct kvm_vcpu_arch {
#endif
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 hvc_exit_stat;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
u64 mmio_exit_user;
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index cff510574fae..b18e852d27e8 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -123,6 +123,7 @@ typeof(orig) * __hyp_text fname(void) \
void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index dff109871f2a..a79b969c26fc 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -162,12 +162,6 @@ void kvm_clear_hyp_idmap(void);
#define kvm_set_pte(ptep, pte) set_pte(ptep, pte)
#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd)
-static inline void kvm_clean_pgd(pgd_t *pgd) {}
-static inline void kvm_clean_pmd(pmd_t *pmd) {}
-static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
-static inline void kvm_clean_pte(pte_t *pte) {}
-static inline void kvm_clean_pte_entry(pte_t *pte) {}
-
static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
pte_val(pte) |= PTE_S2_RDWR;
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 9c9edc98d271..6eaf12c1d627 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -16,7 +16,7 @@ menuconfig VIRTUALIZATION
if VIRTUALIZATION
-config KVM_ARM_VGIC_V3
+config KVM_ARM_VGIC_V3_ITS
bool
config KVM
@@ -34,7 +34,7 @@ config KVM
select KVM_VFIO
select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQFD
- select KVM_ARM_VGIC_V3
+ select KVM_ARM_VGIC_V3_ITS
select KVM_ARM_PMU if HW_PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_IRQCHIP
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 695eb3c7ef41..d50a82a16ff6 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -16,9 +16,10 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/e
kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
-kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
+kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o
kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index fa96fe2bd469..a204adf29f0a 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -170,9 +170,32 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
{
exit_handle_fn exit_handler;
+ if (ARM_SERROR_PENDING(exception_index)) {
+ u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu));
+
+ /*
+ * HVC/SMC already have an adjusted PC, which we need
+ * to correct in order to return to after having
+ * injected the SError.
+ */
+ if (hsr_ec == ESR_ELx_EC_HVC32 || hsr_ec == ESR_ELx_EC_HVC64 ||
+ hsr_ec == ESR_ELx_EC_SMC32 || hsr_ec == ESR_ELx_EC_SMC64) {
+ u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2;
+ *vcpu_pc(vcpu) -= adj;
+ }
+
+ kvm_inject_vabt(vcpu);
+ return 1;
+ }
+
+ exception_index = ARM_EXCEPTION_CODE(exception_index);
+
switch (exception_index) {
case ARM_EXCEPTION_IRQ:
return 1;
+ case ARM_EXCEPTION_EL1_SERROR:
+ kvm_inject_vabt(vcpu);
+ return 1;
case ARM_EXCEPTION_TRAP:
/*
* See ARM ARM B1.14.1: "Hyp traps on instructions
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 0c85febcc1eb..aaf42ae8d8c3 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -5,9 +5,9 @@
KVM=../../../../virt/kvm
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += entry.o
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index 33342a776ec7..4ba5c9095d03 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -131,9 +131,7 @@ void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu)
vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
}
-static u32 __hyp_text __debug_read_mdcr_el2(void)
+u32 __hyp_text __kvm_get_mdcr_el2(void)
{
return read_sysreg(mdcr_el2);
}
-
-__alias(__debug_read_mdcr_el2) u32 __kvm_get_mdcr_el2(void);
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index ce9e5e5f28cf..12ee62d6d410 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -55,79 +55,111 @@
*/
ENTRY(__guest_enter)
// x0: vcpu
- // x1: host/guest context
- // x2-x18: clobbered by macros
+ // x1: host context
+ // x2-x17: clobbered by macros
+ // x18: guest context
// Store the host regs
save_callee_saved_regs x1
- // Preserve vcpu & host_ctxt for use at exit time
- stp x0, x1, [sp, #-16]!
+ // Store the host_ctxt for use at exit time
+ str x1, [sp, #-16]!
- add x1, x0, #VCPU_CONTEXT
+ add x18, x0, #VCPU_CONTEXT
- // Prepare x0-x1 for later restore by pushing them onto the stack
- ldp x2, x3, [x1, #CPU_XREG_OFFSET(0)]
- stp x2, x3, [sp, #-16]!
+ // Restore guest regs x0-x17
+ ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)]
+ ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)]
- // x2-x18
- ldp x2, x3, [x1, #CPU_XREG_OFFSET(2)]
- ldp x4, x5, [x1, #CPU_XREG_OFFSET(4)]
- ldp x6, x7, [x1, #CPU_XREG_OFFSET(6)]
- ldp x8, x9, [x1, #CPU_XREG_OFFSET(8)]
- ldp x10, x11, [x1, #CPU_XREG_OFFSET(10)]
- ldp x12, x13, [x1, #CPU_XREG_OFFSET(12)]
- ldp x14, x15, [x1, #CPU_XREG_OFFSET(14)]
- ldp x16, x17, [x1, #CPU_XREG_OFFSET(16)]
- ldr x18, [x1, #CPU_XREG_OFFSET(18)]
-
- // x19-x29, lr
- restore_callee_saved_regs x1
-
- // Last bits of the 64bit state
- ldp x0, x1, [sp], #16
+ // Restore guest regs x19-x29, lr
+ restore_callee_saved_regs x18
+
+ // Restore guest reg x18
+ ldr x18, [x18, #CPU_XREG_OFFSET(18)]
// Do not touch any register after this!
eret
ENDPROC(__guest_enter)
ENTRY(__guest_exit)
- // x0: vcpu
- // x1: return code
- // x2-x3: free
- // x4-x29,lr: vcpu regs
- // vcpu x0-x3 on the stack
+ // x0: return code
+ // x1: vcpu
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
- add x2, x0, #VCPU_CONTEXT
+ add x1, x1, #VCPU_CONTEXT
- stp x4, x5, [x2, #CPU_XREG_OFFSET(4)]
- stp x6, x7, [x2, #CPU_XREG_OFFSET(6)]
- stp x8, x9, [x2, #CPU_XREG_OFFSET(8)]
- stp x10, x11, [x2, #CPU_XREG_OFFSET(10)]
- stp x12, x13, [x2, #CPU_XREG_OFFSET(12)]
- stp x14, x15, [x2, #CPU_XREG_OFFSET(14)]
- stp x16, x17, [x2, #CPU_XREG_OFFSET(16)]
- str x18, [x2, #CPU_XREG_OFFSET(18)]
+ ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
- ldp x6, x7, [sp], #16 // x2, x3
- ldp x4, x5, [sp], #16 // x0, x1
+ // Store the guest regs x2 and x3
+ stp x2, x3, [x1, #CPU_XREG_OFFSET(2)]
- stp x4, x5, [x2, #CPU_XREG_OFFSET(0)]
- stp x6, x7, [x2, #CPU_XREG_OFFSET(2)]
+ // Retrieve the guest regs x0-x1 from the stack
+ ldp x2, x3, [sp], #16 // x0, x1
+
+ // Store the guest regs x0-x1 and x4-x18
+ stp x2, x3, [x1, #CPU_XREG_OFFSET(0)]
+ stp x4, x5, [x1, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x1, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x1, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x1, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x1, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x1, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x1, #CPU_XREG_OFFSET(16)]
+ str x18, [x1, #CPU_XREG_OFFSET(18)]
+
+ // Store the guest regs x19-x29, lr
+ save_callee_saved_regs x1
- save_callee_saved_regs x2
+ // Restore the host_ctxt from the stack
+ ldr x2, [sp], #16
- // Restore vcpu & host_ctxt from the stack
- // (preserving return code in x1)
- ldp x0, x2, [sp], #16
// Now restore the host regs
restore_callee_saved_regs x2
- mov x0, x1
- ret
+ // If we have a pending asynchronous abort, now is the
+ // time to find out. From your VAXorcist book, page 666:
+ // "Threaten me not, oh Evil one! For I speak with
+ // the power of DEC, and I command thee to show thyself!"
+ mrs x2, elr_el2
+ mrs x3, esr_el2
+ mrs x4, spsr_el2
+ mov x5, x0
+
+ dsb sy // Synchronize against in-flight ld/st
+ msr daifclr, #4 // Unmask aborts
+
+ // This is our single instruction exception window. A pending
+ // SError is guaranteed to occur at the earliest when we unmask
+ // it, and at the latest just after the ISB.
+ .global abort_guest_exit_start
+abort_guest_exit_start:
+
+ isb
+
+ .global abort_guest_exit_end
+abort_guest_exit_end:
+
+ // If the exception took place, restore the EL1 exception
+ // context so that we can report some information.
+ // Merge the exception code with the SError pending bit.
+ tbz x0, #ARM_EXIT_WITH_SERROR_BIT, 1f
+ msr elr_el2, x2
+ msr esr_el2, x3
+ msr spsr_el2, x4
+ orr x0, x0, x5
+1: ret
ENDPROC(__guest_exit)
ENTRY(__fpsimd_guest_restore)
+ stp x2, x3, [sp, #-16]!
stp x4, lr, [sp, #-16]!
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index f6d9694ae3b1..4e92399f7105 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -27,16 +27,6 @@
.text
.pushsection .hyp.text, "ax"
-.macro save_x0_to_x3
- stp x0, x1, [sp, #-16]!
- stp x2, x3, [sp, #-16]!
-.endm
-
-.macro restore_x0_to_x3
- ldp x2, x3, [sp], #16
- ldp x0, x1, [sp], #16
-.endm
-
.macro do_el2_call
/*
* Shuffle the parameters before calling the function
@@ -79,23 +69,23 @@ ENTRY(__kvm_hyp_teardown)
ENDPROC(__kvm_hyp_teardown)
el1_sync: // Guest trapped into EL2
- save_x0_to_x3
+ stp x0, x1, [sp, #-16]!
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
mrs x1, esr_el2
alternative_else
mrs x1, esr_el1
alternative_endif
- lsr x2, x1, #ESR_ELx_EC_SHIFT
+ lsr x0, x1, #ESR_ELx_EC_SHIFT
- cmp x2, #ESR_ELx_EC_HVC64
+ cmp x0, #ESR_ELx_EC_HVC64
b.ne el1_trap
- mrs x3, vttbr_el2 // If vttbr is valid, the 64bit guest
- cbnz x3, el1_trap // called HVC
+ mrs x1, vttbr_el2 // If vttbr is valid, the 64bit guest
+ cbnz x1, el1_trap // called HVC
/* Here, we're pretty sure the host called HVC. */
- restore_x0_to_x3
+ ldp x0, x1, [sp], #16
cmp x0, #HVC_GET_VECTORS
b.ne 1f
@@ -113,24 +103,51 @@ alternative_endif
el1_trap:
/*
- * x1: ESR
- * x2: ESR_EC
+ * x0: ESR_EC
*/
/* Guest accessed VFP/SIMD registers, save host, restore Guest */
- cmp x2, #ESR_ELx_EC_FP_ASIMD
+ cmp x0, #ESR_ELx_EC_FP_ASIMD
b.eq __fpsimd_guest_restore
- mrs x0, tpidr_el2
- mov x1, #ARM_EXCEPTION_TRAP
+ mrs x1, tpidr_el2
+ mov x0, #ARM_EXCEPTION_TRAP
b __guest_exit
el1_irq:
- save_x0_to_x3
- mrs x0, tpidr_el2
- mov x1, #ARM_EXCEPTION_IRQ
+ stp x0, x1, [sp, #-16]!
+ mrs x1, tpidr_el2
+ mov x0, #ARM_EXCEPTION_IRQ
+ b __guest_exit
+
+el1_error:
+ stp x0, x1, [sp, #-16]!
+ mrs x1, tpidr_el2
+ mov x0, #ARM_EXCEPTION_EL1_SERROR
b __guest_exit
+el2_error:
+ /*
+ * Only two possibilities:
+ * 1) Either we come from the exit path, having just unmasked
+ * PSTATE.A: change the return code to an EL2 fault, and
+ * carry on, as we're already in a sane state to handle it.
+ * 2) Or we come from anywhere else, and that's a bug: we panic.
+ *
+ * For (1), x0 contains the original return code and x1 doesn't
+ * contain anything meaningful at that stage. We can reuse them
+ * as temp registers.
+ * For (2), who cares?
+ */
+ mrs x0, elr_el2
+ adr x1, abort_guest_exit_start
+ cmp x0, x1
+ adr x1, abort_guest_exit_end
+ ccmp x0, x1, #4, ne
+ b.ne __hyp_panic
+ mov x0, #(1 << ARM_EXIT_WITH_SERROR_BIT)
+ eret
+
ENTRY(__hyp_do_panic)
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
PSR_MODE_EL1h)
@@ -155,11 +172,9 @@ ENDPROC(\label)
invalid_vector el2h_sync_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
- invalid_vector el2h_error_invalid
invalid_vector el1_sync_invalid
invalid_vector el1_irq_invalid
invalid_vector el1_fiq_invalid
- invalid_vector el1_error_invalid
.ltorg
@@ -174,15 +189,15 @@ ENTRY(__kvm_hyp_vector)
ventry el2h_sync_invalid // Synchronous EL2h
ventry el2h_irq_invalid // IRQ EL2h
ventry el2h_fiq_invalid // FIQ EL2h
- ventry el2h_error_invalid // Error EL2h
+ ventry el2_error // Error EL2h
ventry el1_sync // Synchronous 64-bit EL1
ventry el1_irq // IRQ 64-bit EL1
ventry el1_fiq_invalid // FIQ 64-bit EL1
- ventry el1_error_invalid // Error 64-bit EL1
+ ventry el1_error // Error 64-bit EL1
ventry el1_sync // Synchronous 32-bit EL1
ventry el1_irq // IRQ 32-bit EL1
ventry el1_fiq_invalid // FIQ 32-bit EL1
- ventry el1_error_invalid // Error 32-bit EL1
+ ventry el1_error // Error 32-bit EL1
ENDPROC(__kvm_hyp_vector)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 5a84b4562603..83037cd62d01 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -16,7 +16,10 @@
*/
#include <linux/types.h>
+#include <linux/jump_label.h>
+
#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
static bool __hyp_text __fpsimd_enabled_nvhe(void)
@@ -109,6 +112,15 @@ static hyp_alternate_select(__deactivate_traps_arch,
static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
{
+ /*
+ * If we pended a virtual abort, preserve it until it gets
+ * cleared. See D1.14.3 (Virtual Interrupts) for details, but
+ * the crucial bit is "On taking a vSError interrupt,
+ * HCR_EL2.VSE is cleared to 0."
+ */
+ if (vcpu->arch.hcr_el2 & HCR_VSE)
+ vcpu->arch.hcr_el2 = read_sysreg(hcr_el2);
+
__deactivate_traps_arch()();
write_sysreg(0, hstr_el2);
write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
@@ -126,17 +138,13 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
write_sysreg(0, vttbr_el2);
}
-static hyp_alternate_select(__vgic_call_save_state,
- __vgic_v2_save_state, __vgic_v3_save_state,
- ARM64_HAS_SYSREG_GIC_CPUIF);
-
-static hyp_alternate_select(__vgic_call_restore_state,
- __vgic_v2_restore_state, __vgic_v3_restore_state,
- ARM64_HAS_SYSREG_GIC_CPUIF);
-
static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
{
- __vgic_call_save_state()(vcpu);
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_save_state(vcpu);
+ else
+ __vgic_v2_save_state(vcpu);
+
write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2);
}
@@ -149,7 +157,10 @@ static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
val |= vcpu->arch.irq_lines;
write_sysreg(val, hcr_el2);
- __vgic_call_restore_state()(vcpu);
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_restore_state(vcpu);
+ else
+ __vgic_v2_restore_state(vcpu);
}
static bool __hyp_text __true_value(void)
@@ -232,7 +243,22 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
return true;
}
-static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
+{
+ *vcpu_pc(vcpu) = read_sysreg_el2(elr);
+
+ if (vcpu_mode_is_32bit(vcpu)) {
+ vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr);
+ kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr);
+ } else {
+ *vcpu_pc(vcpu) += 4;
+ }
+
+ write_sysreg_el2(*vcpu_pc(vcpu), elr);
+}
+
+int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_cpu_context *guest_ctxt;
@@ -267,9 +293,43 @@ again:
exit_code = __guest_enter(vcpu, host_ctxt);
/* And we're baaack! */
+ /*
+ * We're using the raw exception code in order to only process
+ * the trap if no SError is pending. We will come back to the
+ * same PC once the SError has been injected, and replay the
+ * trapping instruction.
+ */
if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
goto again;
+ if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
+ exit_code == ARM_EXCEPTION_TRAP) {
+ bool valid;
+
+ valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW &&
+ kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
+ kvm_vcpu_dabt_isvalid(vcpu) &&
+ !kvm_vcpu_dabt_isextabt(vcpu) &&
+ !kvm_vcpu_dabt_iss1tw(vcpu);
+
+ if (valid) {
+ int ret = __vgic_v2_perform_cpuif_access(vcpu);
+
+ if (ret == 1) {
+ __skip_instr(vcpu);
+ goto again;
+ }
+
+ if (ret == -1) {
+ /* Promote an illegal access to an SError */
+ __skip_instr(vcpu);
+ exit_code = ARM_EXCEPTION_EL1_SERROR;
+ }
+
+ /* 0 falls through to be handler out of EL2 */
+ }
+ }
+
fp_enabled = __fpsimd_enabled();
__sysreg_save_guest_state(guest_ctxt);
@@ -293,8 +353,6 @@ again:
return exit_code;
}
-__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-
static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index be8177cdd3bf..9cc0ea784ae6 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -17,7 +17,7 @@
#include <asm/kvm_hyp.h>
-static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
dsb(ishst);
@@ -48,10 +48,7 @@ static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
write_sysreg(0, vttbr_el2);
}
-__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
- phys_addr_t ipa);
-
-static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
{
dsb(ishst);
@@ -67,14 +64,10 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
write_sysreg(0, vttbr_el2);
}
-__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
-static void __hyp_text __tlb_flush_vm_context(void)
+void __hyp_text __kvm_flush_vm_context(void)
{
dsb(ishst);
asm volatile("tlbi alle1is \n"
"ic ialluis ": : );
dsb(ish);
}
-
-__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 898c0e6aedd4..da6a8cfa54a0 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -231,3 +231,15 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
else
inject_undef64(vcpu);
}
+
+/**
+ * kvm_inject_vabt - inject an async abort / SError into the guest
+ * @vcpu: The VCPU to receive the exception
+ *
+ * It is assumed that this code is called from the VCPU thread and that the
+ * VCPU therefore is not currently executing guest code.
+ */
+void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+{
+ vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE);
+}
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b54bcadd8aec..07f58cfc1ab9 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -107,35 +107,49 @@
#define KVM_INVALID_INST 0xdeadbeef
#define KVM_INVALID_ADDR 0xdeadbeef
+/*
+ * EVA has overlapping user & kernel address spaces, so user VAs may be >
+ * PAGE_OFFSET. For this reason we can't use the default KVM_HVA_ERR_BAD of
+ * PAGE_OFFSET.
+ */
+
+#define KVM_HVA_ERR_BAD (-1UL)
+#define KVM_HVA_ERR_RO_BAD (-2UL)
+
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+ return IS_ERR_VALUE(addr);
+}
+
extern atomic_t kvm_mips_instance;
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 wait_exits;
- u32 cache_exits;
- u32 signal_exits;
- u32 int_exits;
- u32 cop_unusable_exits;
- u32 tlbmod_exits;
- u32 tlbmiss_ld_exits;
- u32 tlbmiss_st_exits;
- u32 addrerr_st_exits;
- u32 addrerr_ld_exits;
- u32 syscall_exits;
- u32 resvd_inst_exits;
- u32 break_inst_exits;
- u32 trap_inst_exits;
- u32 msa_fpe_exits;
- u32 fpe_exits;
- u32 msa_disabled_exits;
- u32 flush_dcache_exits;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
+ u64 wait_exits;
+ u64 cache_exits;
+ u64 signal_exits;
+ u64 int_exits;
+ u64 cop_unusable_exits;
+ u64 tlbmod_exits;
+ u64 tlbmiss_ld_exits;
+ u64 tlbmiss_st_exits;
+ u64 addrerr_st_exits;
+ u64 addrerr_ld_exits;
+ u64 syscall_exits;
+ u64 resvd_inst_exits;
+ u64 break_inst_exits;
+ u64 trap_inst_exits;
+ u64 msa_fpe_exits;
+ u64 fpe_exits;
+ u64 msa_disabled_exits;
+ u64 flush_dcache_exits;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
};
struct kvm_arch_memory_slot {
@@ -314,6 +328,9 @@ struct kvm_vcpu_arch {
u32 guest_kernel_asid[NR_CPUS];
struct mm_struct guest_kernel_mm, guest_user_mm;
+ /* Guest ASID of last user mode execution */
+ unsigned int last_user_gasid;
+
int last_sched_cpu;
/* WAIT executed */
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index e788515f766b..4db4c0370859 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -846,6 +846,47 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
return EMULATE_FAIL;
}
+/**
+ * kvm_mips_invalidate_guest_tlb() - Indicates a change in guest MMU map.
+ * @vcpu: VCPU with changed mappings.
+ * @tlb: TLB entry being removed.
+ *
+ * This is called to indicate a single change in guest MMU mappings, so that we
+ * can arrange TLB flushes on this and other CPUs.
+ */
+static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
+ struct kvm_mips_tlb *tlb)
+{
+ int cpu, i;
+ bool user;
+
+ /* No need to flush for entries which are already invalid */
+ if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V))
+ return;
+ /* User address space doesn't need flushing for KSeg2/3 changes */
+ user = tlb->tlb_hi < KVM_GUEST_KSEG0;
+
+ preempt_disable();
+
+ /*
+ * Probe the shadow host TLB for the entry being overwritten, if one
+ * matches, invalidate it
+ */
+ kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+
+ /* Invalidate the whole ASID on other CPUs */
+ cpu = smp_processor_id();
+ for_each_possible_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (user)
+ vcpu->arch.guest_user_asid[i] = 0;
+ vcpu->arch.guest_kernel_asid[i] = 0;
+ }
+
+ preempt_enable();
+}
+
/* Write Guest TLB Entry @ Index */
enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
{
@@ -865,11 +906,8 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
}
tlb = &vcpu->arch.guest_tlb[index];
- /*
- * Probe the shadow host TLB for the entry being overwritten, if one
- * matches, invalidate it
- */
- kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+
+ kvm_mips_invalidate_guest_tlb(vcpu, tlb);
tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
@@ -898,11 +936,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
tlb = &vcpu->arch.guest_tlb[index];
- /*
- * Probe the shadow host TLB for the entry being overwritten, if one
- * matches, invalidate it
- */
- kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+ kvm_mips_invalidate_guest_tlb(vcpu, tlb);
tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
@@ -1026,6 +1060,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
enum emulation_result er = EMULATE_DONE;
u32 rt, rd, sel;
unsigned long curr_pc;
+ int cpu, i;
/*
* Update PC and hold onto current PC in case there is
@@ -1127,16 +1162,31 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
u32 nasid =
vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
- if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
- ((kvm_read_c0_guest_entryhi(cop0) &
+ if (((kvm_read_c0_guest_entryhi(cop0) &
KVM_ENTRYHI_ASID) != nasid)) {
trace_kvm_asid_change(vcpu,
kvm_read_c0_guest_entryhi(cop0)
& KVM_ENTRYHI_ASID,
nasid);
- /* Blow away the shadow host TLBs */
- kvm_mips_flush_host_tlb(1);
+ /*
+ * Regenerate/invalidate kernel MMU
+ * context.
+ * The user MMU context will be
+ * regenerated lazily on re-entry to
+ * guest user if the guest ASID actually
+ * changes.
+ */
+ preempt_disable();
+ cpu = smp_processor_id();
+ kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm,
+ cpu, vcpu);
+ vcpu->arch.guest_kernel_asid[cpu] =
+ vcpu->arch.guest_kernel_mm.context.asid[cpu];
+ for_each_possible_cpu(i)
+ if (i != cpu)
+ vcpu->arch.guest_kernel_asid[i] = 0;
+ preempt_enable();
}
kvm_write_c0_guest_entryhi(cop0,
vcpu->arch.gprs[rt]);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a6ea084b4d9d..ce961495b5e1 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -140,6 +140,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
void kvm_mips_free_vcpus(struct kvm *kvm)
{
unsigned int i;
@@ -411,6 +421,31 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
return -ENOIOCTLCMD;
}
+/* Must be called with preemption disabled, just before entering guest */
+static void kvm_mips_check_asids(struct kvm_vcpu *vcpu)
+{
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+ int cpu = smp_processor_id();
+ unsigned int gasid;
+
+ /*
+ * Lazy host ASID regeneration for guest user mode.
+ * If the guest ASID has changed since the last guest usermode
+ * execution, regenerate the host ASID so as to invalidate stale TLB
+ * entries.
+ */
+ if (!KVM_GUEST_KERNEL_MODE(vcpu)) {
+ gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
+ if (gasid != vcpu->arch.last_user_gasid) {
+ kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu,
+ vcpu);
+ vcpu->arch.guest_user_asid[cpu] =
+ vcpu->arch.guest_user_mm.context.asid[cpu];
+ vcpu->arch.last_user_gasid = gasid;
+ }
+ }
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
int r = 0;
@@ -438,6 +473,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
htw_stop();
trace_kvm_enter(vcpu);
+
+ kvm_mips_check_asids(vcpu);
+
r = vcpu->arch.vcpu_run(run, vcpu);
trace_kvm_out(vcpu);
@@ -1551,6 +1589,8 @@ skip_emul:
if (ret == RESUME_GUEST) {
trace_kvm_reenter(vcpu);
+ kvm_mips_check_asids(vcpu);
+
/*
* If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
* is live), restore FCR31 / MSACSR.
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 121008c0fcc9..03883ba806e2 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -250,15 +250,27 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
vcpu->arch.guest_kernel_asid[cpu] =
vcpu->arch.guest_kernel_mm.context.asid[cpu];
+ newasid++;
+
+ kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+ cpu_context(cpu, current->mm));
+ kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+ cpu, vcpu->arch.guest_kernel_asid[cpu]);
+ }
+
+ if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) &
+ asid_version_mask(cpu)) {
+ u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) &
+ KVM_ENTRYHI_ASID;
+
kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
vcpu->arch.guest_user_asid[cpu] =
vcpu->arch.guest_user_mm.context.asid[cpu];
+ vcpu->arch.last_user_gasid = gasid;
newasid++;
kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
cpu_context(cpu, current->mm));
- kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
- cpu, vcpu->arch.guest_kernel_asid[cpu]);
kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
vcpu->arch.guest_user_asid[cpu]);
}
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 091553942bcb..3a5484f9aa50 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -175,6 +175,24 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
}
+ } else if (KVM_GUEST_KERNEL_MODE(vcpu)
+ && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
+ /*
+ * With EVA we may get a TLB exception instead of an address
+ * error when the guest performs MMIO to KSeg1 addresses.
+ */
+ kvm_debug("Emulate %s MMIO space\n",
+ store ? "Store to" : "Load from");
+ er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
+ if (er == EMULATE_FAIL) {
+ kvm_err("Emulate %s MMIO space failed\n",
+ store ? "Store to" : "Load from");
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ ret = RESUME_HOST;
+ } else {
+ run->exit_reason = KVM_EXIT_MMIO;
+ ret = RESUME_HOST;
+ }
} else {
kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
store ? "ST" : "LD", cause, opc, badvaddr);
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 287a656ceb57..e407af2b7333 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -245,6 +245,43 @@ static inline int segment_shift(int ssize)
}
/*
+ * This array is indexed by the LP field of the HPTE second dword.
+ * Since this field may contain some RPN bits, some entries are
+ * replicated so that we get the same value irrespective of RPN.
+ * The top 4 bits are the page size index (MMU_PAGE_*) for the
+ * actual page size, the bottom 4 bits are the base page size.
+ */
+extern u8 hpte_page_sizes[1 << LP_BITS];
+
+static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
+ bool is_base_size)
+{
+ unsigned int i, lp;
+
+ if (!(h & HPTE_V_LARGE))
+ return 1ul << 12;
+
+ /* Look at the 8 bit LP value */
+ lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+ i = hpte_page_sizes[lp];
+ if (!i)
+ return 0;
+ if (!is_base_size)
+ i >>= 4;
+ return 1ul << mmu_psize_defs[i & 0xf].shift;
+}
+
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+ return __hpte_page_size(h, l, 0);
+}
+
+static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
+{
+ return __hpte_page_size(h, l, 1);
+}
+
+/*
* The current system page and segment sizes
*/
extern int mmu_kernel_ssize;
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 2fd1690b79d2..f6fda8482f60 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
#endif
#endif /* __powerpc64__ */
+
+/*
+ * Simple Cache inhibited accessors
+ * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
+ * barriers, callers need to manage memory barriers on their own.
+ * These can only be used in hypervisor real mode.
+ */
+
+static inline u32 _lwzcix(unsigned long addr)
+{
+ u32 ret;
+
+ __asm__ __volatile__("lwzcix %0,0, %1"
+ : "=r" (ret) : "r" (addr) : "memory");
+ return ret;
+}
+
+static inline void _stbcix(u64 addr, u8 val)
+{
+ __asm__ __volatile__("stbcix %0,0,%1"
+ : : "r" (val), "r" (addr) : "memory");
+}
+
+static inline void _stwcix(u64 addr, u32 val)
+{
+ __asm__ __volatile__("stwcix %0,0,%1"
+ : : "r" (val), "r" (addr) : "memory");
+}
+
/*
* Low level IO stream instructions are defined out of line for now
*/
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 5bca220bbb60..05cabed3d1bd 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -105,6 +105,15 @@
#define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60
#define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80
+/* book3s_hv */
+
+/*
+ * Special trap used to indicate to host that this is a
+ * passthrough interrupt that could not be handled
+ * completely in the guest.
+ */
+#define BOOK3S_INTERRUPT_HV_RM_HARD 0x5555
+
#define BOOK3S_IRQPRIO_SYSTEM_RESET 0
#define BOOK3S_IRQPRIO_DATA_SEGMENT 1
#define BOOK3S_IRQPRIO_INST_SEGMENT 2
@@ -136,6 +145,7 @@
#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */
#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
#define RESUME_FLAG_ARCH1 (1<<2)
+#define RESUME_FLAG_ARCH2 (1<<3)
#define RESUME_GUEST 0
#define RESUME_GUEST_NV RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8f39796c9da8..5cf306ae0ac3 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -69,6 +69,43 @@ struct hpte_cache {
int pagesize;
};
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits. This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
+ */
+struct kvmppc_vcore {
+ int n_runnable;
+ int num_threads;
+ int entry_exit_map;
+ int napping_threads;
+ int first_vcpuid;
+ u16 pcpu;
+ u16 last_cpu;
+ u8 vcore_state;
+ u8 in_guest;
+ struct kvmppc_vcore *master_vcore;
+ struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
+ struct list_head preempt_list;
+ spinlock_t lock;
+ struct swait_queue_head wq;
+ spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
+ u64 stolen_tb;
+ u64 preempt_tb;
+ struct kvm_vcpu *runner;
+ struct kvm *kvm;
+ u64 tb_offset; /* guest timebase - host timebase */
+ ulong lpcr;
+ u32 arch_compat;
+ ulong pcr;
+ ulong dpdes; /* doorbell state (POWER8) */
+ ulong vtb; /* virtual timebase */
+ ulong conferring_threads;
+ unsigned int halt_poll_ns;
+};
+
struct kvmppc_vcpu_book3s {
struct kvmppc_sid_map sid_map[SID_MAP_NUM];
struct {
@@ -83,6 +120,7 @@ struct kvmppc_vcpu_book3s {
u64 sdr1;
u64 hior;
u64 msr_mask;
+ u64 vtb;
#ifdef CONFIG_PPC_BOOK3S_32
u32 vsid_pool[VSID_POOL_SIZE];
u32 vsid_next;
@@ -191,6 +229,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
struct kvm_vcpu *vcpu);
extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
struct kvmppc_book3s_shadow_vcpu *svcpu);
+extern int kvm_irq_bypass;
static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
{
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 88d17b4ea9c8..848292176908 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,6 +20,8 @@
#ifndef __ASM_KVM_BOOK3S_64_H__
#define __ASM_KVM_BOOK3S_64_H__
+#include <asm/book3s/64/mmu-hash.h>
+
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
{
@@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
hpte[0] = cpu_to_be64(hpte_v);
}
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
- int i, shift;
- unsigned int mask;
-
- /* start from 1 ignoring MMU_PAGE_4K */
- for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
- /* invalid penc */
- if (mmu_psize_defs[psize].penc[i] == -1)
- continue;
- /*
- * encoding bits per actual page size
- * PTE LP actual page size
- * rrrr rrrz >=8KB
- * rrrr rrzz >=16KB
- * rrrr rzzz >=32KB
- * rrrr zzzz >=64KB
- * .......
- */
- shift = mmu_psize_defs[i].shift - LP_SHIFT;
- if (shift > LP_BITS)
- shift = LP_BITS;
- mask = (1 << shift) - 1;
- if ((lp & mask) == mmu_psize_defs[psize].penc[i])
- return i;
- }
- return -1;
-}
-
static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
unsigned long pte_index)
{
- int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
+ int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
unsigned int penc;
unsigned long rb = 0, va_low, sllp;
unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
if (v & HPTE_V_LARGE) {
- for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) {
-
- /* valid entries have a shift value */
- if (!mmu_psize_defs[b_psize].shift)
- continue;
-
- a_psize = __hpte_actual_psize(lp, b_psize);
- if (a_psize != -1)
- break;
- }
+ i = hpte_page_sizes[lp];
+ b_psize = i & 0xf;
+ a_psize = i >> 4;
}
+
/*
* Ignore the top 14 bits of va
* v have top two bits covering segment size, hence move
@@ -159,7 +125,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
/* This covers 14..54 bits of va*/
rb = (v & ~0x7fUL) << 16; /* AVA field */
- rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */
/*
* AVA in v had cleared lower 23 bits. We need to derive
* that from pteg index
@@ -211,49 +176,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
break;
}
}
- rb |= (v >> 54) & 0x300; /* B field */
+ rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */
return rb;
}
-static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
- bool is_base_size)
-{
-
- int size, a_psize;
- /* Look at the 8 bit LP value */
- unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
- /* only handle 4k, 64k and 16M pages for now */
- if (!(h & HPTE_V_LARGE))
- return 1ul << 12;
- else {
- for (size = 0; size < MMU_PAGE_COUNT; size++) {
- /* valid entries have a shift value */
- if (!mmu_psize_defs[size].shift)
- continue;
-
- a_psize = __hpte_actual_psize(lp, size);
- if (a_psize != -1) {
- if (is_base_size)
- return 1ul << mmu_psize_defs[size].shift;
- return 1ul << mmu_psize_defs[a_psize].shift;
- }
- }
-
- }
- return 0;
-}
-
-static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
-{
- return __hpte_page_size(h, l, 0);
-}
-
-static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
-{
- return __hpte_page_size(h, l, 1);
-}
-
static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
{
return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index ec35af34a3fb..28350a294b1e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -43,6 +43,8 @@
#include <asm/cputhreads.h>
#define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES)
+#define __KVM_HAVE_ARCH_INTC_INITIALIZED
+
#ifdef CONFIG_KVM_MMIO
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
#endif
@@ -95,42 +97,49 @@ struct kvmppc_vcpu_book3s;
struct kvmppc_book3s_shadow_vcpu;
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 sum_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 light_exits;
+ u64 sum_exits;
+ u64 mmio_exits;
+ u64 signal_exits;
+ u64 light_exits;
/* Account for special types of light exits: */
- u32 itlb_real_miss_exits;
- u32 itlb_virt_miss_exits;
- u32 dtlb_real_miss_exits;
- u32 dtlb_virt_miss_exits;
- u32 syscall_exits;
- u32 isi_exits;
- u32 dsi_exits;
- u32 emulated_inst_exits;
- u32 dec_exits;
- u32 ext_intr_exits;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 dbell_exits;
- u32 gdbell_exits;
- u32 ld;
- u32 st;
+ u64 itlb_real_miss_exits;
+ u64 itlb_virt_miss_exits;
+ u64 dtlb_real_miss_exits;
+ u64 dtlb_virt_miss_exits;
+ u64 syscall_exits;
+ u64 isi_exits;
+ u64 dsi_exits;
+ u64 emulated_inst_exits;
+ u64 dec_exits;
+ u64 ext_intr_exits;
+ u64 halt_poll_success_ns;
+ u64 halt_poll_fail_ns;
+ u64 halt_wait_ns;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_successful_wait;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 dbell_exits;
+ u64 gdbell_exits;
+ u64 ld;
+ u64 st;
#ifdef CONFIG_PPC_BOOK3S
- u32 pf_storage;
- u32 pf_instruc;
- u32 sp_storage;
- u32 sp_instruc;
- u32 queue_intr;
- u32 ld_slow;
- u32 st_slow;
+ u64 pf_storage;
+ u64 pf_instruc;
+ u64 sp_storage;
+ u64 sp_instruc;
+ u64 queue_intr;
+ u64 ld_slow;
+ u64 st_slow;
#endif
+ u64 pthru_all;
+ u64 pthru_host;
+ u64 pthru_bad_aff;
};
enum kvm_exit_types {
@@ -197,6 +206,8 @@ struct kvmppc_spapr_tce_table {
struct kvmppc_xics;
struct kvmppc_icp;
+struct kvmppc_passthru_irqmap;
+
/*
* The reverse mapping array has one entry for each HPTE,
* which stores the guest's view of the second word of the HPTE
@@ -267,6 +278,7 @@ struct kvm_arch {
#endif
#ifdef CONFIG_KVM_XICS
struct kvmppc_xics *xics;
+ struct kvmppc_passthru_irqmap *pimap;
#endif
struct kvmppc_ops *kvm_ops;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -275,41 +287,6 @@ struct kvm_arch {
#endif
};
-/*
- * Struct for a virtual core.
- * Note: entry_exit_map combines a bitmap of threads that have entered
- * in the bottom 8 bits and a bitmap of threads that have exited in the
- * next 8 bits. This is so that we can atomically set the entry bit
- * iff the exit map is 0 without taking a lock.
- */
-struct kvmppc_vcore {
- int n_runnable;
- int num_threads;
- int entry_exit_map;
- int napping_threads;
- int first_vcpuid;
- u16 pcpu;
- u16 last_cpu;
- u8 vcore_state;
- u8 in_guest;
- struct kvmppc_vcore *master_vcore;
- struct list_head runnable_threads;
- struct list_head preempt_list;
- spinlock_t lock;
- struct swait_queue_head wq;
- spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
- u64 stolen_tb;
- u64 preempt_tb;
- struct kvm_vcpu *runner;
- struct kvm *kvm;
- u64 tb_offset; /* guest timebase - host timebase */
- ulong lpcr;
- u32 arch_compat;
- ulong pcr;
- ulong dpdes; /* doorbell state (POWER8) */
- ulong conferring_threads;
-};
-
#define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff)
#define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8)
#define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0)
@@ -329,6 +306,7 @@ struct kvmppc_vcore {
#define VCORE_SLEEPING 3
#define VCORE_RUNNING 4
#define VCORE_EXITING 5
+#define VCORE_POLLING 6
/*
* Struct used to manage memory for a virtual processor area
@@ -397,6 +375,20 @@ struct kvmhv_tb_accumulator {
u64 tb_max; /* max time */
};
+#ifdef CONFIG_PPC_BOOK3S_64
+struct kvmppc_irq_map {
+ u32 r_hwirq;
+ u32 v_hwirq;
+ struct irq_desc *desc;
+};
+
+#define KVMPPC_PIRQ_MAPPED 1024
+struct kvmppc_passthru_irqmap {
+ int n_mapped;
+ struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED];
+};
+#endif
+
# ifdef CONFIG_PPC_FSL_BOOK3E
#define KVMPPC_BOOKE_IAC_NUM 2
#define KVMPPC_BOOKE_DAC_NUM 2
@@ -483,7 +475,6 @@ struct kvm_vcpu_arch {
ulong purr;
ulong spurr;
ulong ic;
- ulong vtb;
ulong dscr;
ulong amr;
ulong uamor;
@@ -668,7 +659,6 @@ struct kvm_vcpu_arch {
long pgfault_index;
unsigned long pgfault_hpte[2];
- struct list_head run_list;
struct task_struct *run_task;
struct kvm_run *kvm_run;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2544edabe7f3..f6e49640dbe1 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -287,6 +287,10 @@ struct kvmppc_ops {
long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
unsigned long arg);
int (*hcall_implemented)(unsigned long hcall);
+ int (*irq_bypass_add_producer)(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
+ void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
};
extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -453,8 +457,19 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{
return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
}
+
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+ struct kvm *kvm)
+{
+ if (kvm && kvm_irq_bypass)
+ return kvm->arch.pimap;
+ return NULL;
+}
+
extern void kvmppc_alloc_host_rm_ops(void);
extern void kvmppc_free_host_rm_ops(void);
+extern void kvmppc_free_pimap(struct kvm *kvm);
+extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -464,10 +479,23 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
extern void kvmppc_xics_ipi_action(void);
+extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+ unsigned long host_irq);
+extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+ unsigned long host_irq);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
+ struct kvmppc_irq_map *irq_map,
+ struct kvmppc_passthru_irqmap *pimap);
extern int h_ipi_redirect;
#else
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+ struct kvm *kvm)
+ { return NULL; }
static inline void kvmppc_alloc_host_rm_ops(void) {};
static inline void kvmppc_free_host_rm_ops(void) {};
+static inline void kvmppc_free_pimap(struct kvm *kvm) {};
+static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+ { return 0; }
static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e2fb408f8398..b78e8d3377f6 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void)
#define MMU_PAGE_16G 13
#define MMU_PAGE_64G 14
+/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
#define MMU_PAGE_COUNT 15
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ee05bd203630..e958b7096f19 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func,
int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func,
uint64_t offset, uint32_t data);
int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
+int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority);
int64_t opal_register_exception_handler(uint64_t opal_exception,
uint64_t handler_address,
diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index 0cbd8134ce81..1b46b52d3212 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/pci_hotplug.h>
+#include <linux/irq.h>
#include <misc/cxl-base.h>
#include <asm/opal-api.h>
@@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
int pnv_cxl_get_irq_count(struct pci_dev *dev);
struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+bool is_pnv_opal_msi(struct irq_chip *chip);
#ifdef CONFIG_CXL_BASE
int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f69f40f1519a..978dada662ae 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -737,6 +737,7 @@
#define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */
#define SPRN_MMCR1 798
#define SPRN_MMCR2 785
+#define SPRN_UMMCR2 769
#define SPRN_MMCRA 0x312
#define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
#define MMCRA_SDAR_DCACHE_MISS 0x40000000UL
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b89d14c0352c..a51ae9b165e0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -506,7 +506,6 @@ int main(void)
DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic));
- DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb));
DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
@@ -557,6 +556,7 @@ int main(void)
DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes));
+ DEFINE(VCORE_VTB, offsetof(struct kvmppc_vcore, vtb));
DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index c2024ac9d4e8..029be26b5a17 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,6 +22,9 @@ config KVM
select ANON_INODES
select HAVE_KVM_EVENTFD
select SRCU
+ select KVM_VFIO
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
config KVM_BOOK3S_HANDLER
bool
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 855d4b95d752..7dd89b79d038 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
KVM := ../../../virt/kvm
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
CFLAGS_e500_mmu.o := -I.
CFLAGS_e500_mmu_host.o := -I.
CFLAGS_emulate.o := -I.
CFLAGS_emulate_loadstore.o := -I.
-common-objs-y += powerpc.o emulate.o emulate_loadstore.o
+common-objs-y += powerpc.o emulate_loadstore.o
obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
@@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
kvm-e500-objs := \
$(common-objs-y) \
+ emulate.o \
booke.o \
booke_emulate.o \
booke_interrupts.o \
@@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
kvm-e500mc-objs := \
$(common-objs-y) \
+ emulate.o \
booke.o \
booke_emulate.o \
bookehv_interrupts.o \
@@ -61,9 +63,6 @@ kvm-pr-y := \
book3s_32_mmu.o
ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-kvm-book3s_64-module-objs := \
- $(KVM)/coalesced_mmio.o
-
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
book3s_rmhandlers.o
endif
@@ -89,11 +88,8 @@ endif
kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
book3s_xics.o
-kvm-book3s_64-module-objs += \
- $(KVM)/kvm_main.o \
- $(KVM)/eventfd.o \
- powerpc.o \
- emulate_loadstore.o \
+kvm-book3s_64-module-objs := \
+ $(common-objs-y) \
book3s.o \
book3s_64_vio.o \
book3s_rtas.o \
@@ -103,6 +99,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
kvm-book3s_32-objs := \
$(common-objs-y) \
+ emulate.o \
fpu.o \
book3s_paired_singles.o \
book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 47018fcbf7d6..b6952dd23152 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "dec", VCPU_STAT(dec_exits) },
{ "ext_intr", VCPU_STAT(ext_intr_exits) },
{ "queue_intr", VCPU_STAT(queue_intr) },
+ { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) },
+ { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) },
+ { "halt_wait_ns", VCPU_STAT(halt_wait_ns) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+ { "halt_successful_wait", VCPU_STAT(halt_successful_wait) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "pf_storage", VCPU_STAT(pf_storage) },
@@ -64,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "ld_slow", VCPU_STAT(ld_slow) },
{ "st", VCPU_STAT(st) },
{ "st_slow", VCPU_STAT(st_slow) },
+ { "pthru_all", VCPU_STAT(pthru_all) },
+ { "pthru_host", VCPU_STAT(pthru_host) },
+ { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) },
{ NULL }
};
@@ -592,9 +599,6 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_BESCR:
*val = get_reg_val(id, vcpu->arch.bescr);
break;
- case KVM_REG_PPC_VTB:
- *val = get_reg_val(id, vcpu->arch.vtb);
- break;
case KVM_REG_PPC_IC:
*val = get_reg_val(id, vcpu->arch.ic);
break;
@@ -666,9 +670,6 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_BESCR:
vcpu->arch.bescr = set_reg_val(id, *val);
break;
- case KVM_REG_PPC_VTB:
- vcpu->arch.vtb = set_reg_val(id, *val);
- break;
case KVM_REG_PPC_IC:
vcpu->arch.ic = set_reg_val(id, *val);
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 2afdb9c0937d..8359752b3efc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -498,6 +498,7 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
case SPRN_MMCR0:
case SPRN_MMCR1:
case SPRN_MMCR2:
+ case SPRN_UMMCR2:
#endif
break;
unprivileged:
@@ -579,7 +580,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
*spr_val = vcpu->arch.spurr;
break;
case SPRN_VTB:
- *spr_val = vcpu->arch.vtb;
+ *spr_val = to_book3s(vcpu)->vtb;
break;
case SPRN_IC:
*spr_val = vcpu->arch.ic;
@@ -640,6 +641,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
case SPRN_MMCR0:
case SPRN_MMCR1:
case SPRN_MMCR2:
+ case SPRN_UMMCR2:
case SPRN_TIR:
#endif
*spr_val = 0;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2fd5580c8f6e..3686471be32b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -53,11 +53,15 @@
#include <asm/smp.h>
#include <asm/dbell.h>
#include <asm/hmi.h>
+#include <asm/pnv-pci.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
#include <linux/module.h>
+#include <linux/compiler.h>
#include "book3s.h"
@@ -70,6 +74,8 @@
/* Used to indicate that a guest page fault needs to be handled */
#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2)
/* Used as a "null" value for timebase values */
#define TB_NIL (~(u64)0)
@@ -89,14 +95,55 @@ static struct kernel_param_ops module_param_ops = {
.get = param_get_int,
};
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
+ S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif
+/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
+static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
+module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
+
+/* Factor by which the vcore halt poll interval is grown, default is to double
+ */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
+
+/* Factor by which the vcore halt poll interval is shrunk, default is to reset
+ */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
+
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+ int *ip)
+{
+ int i = *ip;
+ struct kvm_vcpu *vcpu;
+
+ while (++i < MAX_SMT_THREADS) {
+ vcpu = READ_ONCE(vc->runnable_threads[i]);
+ if (vcpu) {
+ *ip = i;
+ return vcpu;
+ }
+ }
+ return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
+ for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
static bool kvmppc_ipi_thread(int cpu)
{
/* On POWER8 for IPIs to threads in the same core, use msgsnd */
@@ -991,6 +1038,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
break;
+ case BOOK3S_INTERRUPT_HV_RM_HARD:
+ r = RESUME_PASSTHROUGH;
+ break;
default:
kvmppc_dump_regs(vcpu);
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
@@ -1149,6 +1199,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_DPDES:
*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
break;
+ case KVM_REG_PPC_VTB:
+ *val = get_reg_val(id, vcpu->arch.vcore->vtb);
+ break;
case KVM_REG_PPC_DAWR:
*val = get_reg_val(id, vcpu->arch.dawr);
break;
@@ -1341,6 +1394,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_DPDES:
vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
break;
+ case KVM_REG_PPC_VTB:
+ vcpu->arch.vcore->vtb = set_reg_val(id, *val);
+ break;
case KVM_REG_PPC_DAWR:
vcpu->arch.dawr = set_reg_val(id, *val);
break;
@@ -1493,7 +1549,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
if (vcore == NULL)
return NULL;
- INIT_LIST_HEAD(&vcore->runnable_threads);
spin_lock_init(&vcore->lock);
spin_lock_init(&vcore->stoltb_lock);
init_swait_queue_head(&vcore->wq);
@@ -1802,7 +1857,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
spin_unlock_irq(&vcpu->arch.tbacct_lock);
--vc->n_runnable;
- list_del(&vcpu->arch.run_list);
+ WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
}
static int kvmppc_grab_hwthread(int cpu)
@@ -2048,66 +2103,6 @@ static void init_master_vcore(struct kvmppc_vcore *vc)
vc->conferring_threads = 0;
}
-/*
- * See if the existing subcores can be split into 3 (or fewer) subcores
- * of at most two threads each, so we can fit in another vcore. This
- * assumes there are at most two subcores and at most 6 threads in total.
- */
-static bool can_split_piggybacked_subcores(struct core_info *cip)
-{
- int sub, new_sub;
- int large_sub = -1;
- int thr;
- int n_subcores = cip->n_subcores;
- struct kvmppc_vcore *vc, *vcnext;
- struct kvmppc_vcore *master_vc = NULL;
-
- for (sub = 0; sub < cip->n_subcores; ++sub) {
- if (cip->subcore_threads[sub] <= 2)
- continue;
- if (large_sub >= 0)
- return false;
- large_sub = sub;
- vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
- preempt_list);
- if (vc->num_threads > 2)
- return false;
- n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
- }
- if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2))
- return false;
-
- /*
- * Seems feasible, so go through and move vcores to new subcores.
- * Note that when we have two or more vcores in one subcore,
- * all those vcores must have only one thread each.
- */
- new_sub = cip->n_subcores;
- thr = 0;
- sub = large_sub;
- list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
- if (thr >= 2) {
- list_del(&vc->preempt_list);
- list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
- /* vc->num_threads must be 1 */
- if (++cip->subcore_threads[new_sub] == 1) {
- cip->subcore_vm[new_sub] = vc->kvm;
- init_master_vcore(vc);
- master_vc = vc;
- ++cip->n_subcores;
- } else {
- vc->master_vcore = master_vc;
- ++new_sub;
- }
- }
- thr += vc->num_threads;
- }
- cip->subcore_threads[large_sub] = 2;
- cip->max_subcore_threads = 2;
-
- return true;
-}
-
static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
{
int n_threads = vc->num_threads;
@@ -2118,23 +2113,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
if (n_threads < cip->max_subcore_threads)
n_threads = cip->max_subcore_threads;
- if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
- cip->max_subcore_threads = n_threads;
- } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
- vc->num_threads <= 2) {
- /*
- * We may be able to fit another subcore in by
- * splitting an existing subcore with 3 or 4
- * threads into two 2-thread subcores, or one
- * with 5 or 6 threads into three subcores.
- * We can only do this if those subcores have
- * piggybacked virtual cores.
- */
- if (!can_split_piggybacked_subcores(cip))
- return false;
- } else {
+ if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
return false;
- }
+ cip->max_subcore_threads = n_threads;
sub = cip->n_subcores;
++cip->n_subcores;
@@ -2148,43 +2129,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
return true;
}
-static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
- struct core_info *cip, int sub)
-{
- struct kvmppc_vcore *vc;
- int n_thr;
-
- vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
- preempt_list);
-
- /* require same VM and same per-core reg values */
- if (pvc->kvm != vc->kvm ||
- pvc->tb_offset != vc->tb_offset ||
- pvc->pcr != vc->pcr ||
- pvc->lpcr != vc->lpcr)
- return false;
-
- /* P8 guest with > 1 thread per core would see wrong TIR value */
- if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
- (vc->num_threads > 1 || pvc->num_threads > 1))
- return false;
-
- n_thr = cip->subcore_threads[sub] + pvc->num_threads;
- if (n_thr > cip->max_subcore_threads) {
- if (!subcore_config_ok(cip->n_subcores, n_thr))
- return false;
- cip->max_subcore_threads = n_thr;
- }
-
- cip->total_threads += pvc->num_threads;
- cip->subcore_threads[sub] = n_thr;
- pvc->master_vcore = vc;
- list_del(&pvc->preempt_list);
- list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
-
- return true;
-}
-
/*
* Work out whether it is possible to piggyback the execution of
* vcore *pvc onto the execution of the other vcores described in *cip.
@@ -2192,27 +2136,18 @@ static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
int target_threads)
{
- int sub;
-
if (cip->total_threads + pvc->num_threads > target_threads)
return false;
- for (sub = 0; sub < cip->n_subcores; ++sub)
- if (cip->subcore_threads[sub] &&
- can_piggyback_subcore(pvc, cip, sub))
- return true;
-
- if (can_dynamic_split(pvc, cip))
- return true;
- return false;
+ return can_dynamic_split(pvc, cip);
}
static void prepare_threads(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu, *vnext;
+ int i;
+ struct kvm_vcpu *vcpu;
- list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, vc) {
if (signal_pending(vcpu->arch.run_task))
vcpu->arch.ret = -EINTR;
else if (vcpu->arch.vpa.update_pending ||
@@ -2259,15 +2194,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
{
- int still_running = 0;
+ int still_running = 0, i;
u64 now;
long ret;
- struct kvm_vcpu *vcpu, *vnext;
+ struct kvm_vcpu *vcpu;
spin_lock(&vc->lock);
now = get_tb();
- list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, vc) {
/* cancel pending dec exception if dec is positive */
if (now < vcpu->arch.dec_expires &&
kvmppc_core_pending_dec(vcpu))
@@ -2307,8 +2241,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
}
if (vc->n_runnable > 0 && vc->runner == NULL) {
/* make sure there's a candidate runner awake */
- vcpu = list_first_entry(&vc->runnable_threads,
- struct kvm_vcpu, arch.run_list);
+ i = -1;
+ vcpu = next_runnable_thread(vc, &i);
wake_up(&vcpu->arch.cpu_run);
}
}
@@ -2361,7 +2295,7 @@ static inline void kvmppc_set_host_core(int cpu)
*/
static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu, *vnext;
+ struct kvm_vcpu *vcpu;
int i;
int srcu_idx;
struct core_info core_info;
@@ -2397,8 +2331,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
*/
if ((threads_per_core > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
- list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu);
wake_up(&vcpu->arch.cpu_run);
@@ -2477,8 +2410,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
active |= 1 << thr;
list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
pvc->pcpu = pcpu + thr;
- list_for_each_entry(vcpu, &pvc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, pvc) {
kvmppc_start_thread(vcpu, pvc);
kvmppc_create_dtl_entry(vcpu, pvc);
trace_kvm_guest_enter(vcpu);
@@ -2604,34 +2536,92 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
finish_wait(&vcpu->arch.cpu_run, &wait);
}
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+ /* 10us base */
+ if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
+ vc->halt_poll_ns = 10000;
+ else
+ vc->halt_poll_ns *= halt_poll_ns_grow;
+
+ if (vc->halt_poll_ns > halt_poll_max_ns)
+ vc->halt_poll_ns = halt_poll_max_ns;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+ if (halt_poll_ns_shrink == 0)
+ vc->halt_poll_ns = 0;
+ else
+ vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+/* Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ for_each_runnable_thread(i, vcpu, vc) {
+ if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* All the vcpus in this vcore are idle, so wait for a decrementer
* or external interrupt to one of the vcpus. vc->lock is held.
*/
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu;
+ ktime_t cur, start_poll, start_wait;
int do_sleep = 1;
+ u64 block_ns;
DECLARE_SWAITQUEUE(wait);
- prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+ /* Poll for pending exceptions and ceded state */
+ cur = start_poll = ktime_get();
+ if (vc->halt_poll_ns) {
+ ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+ ++vc->runner->stat.halt_attempted_poll;
- /*
- * Check one last time for pending exceptions and ceded state after
- * we put ourselves on the wait queue
- */
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
- if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
- do_sleep = 0;
- break;
+ vc->vcore_state = VCORE_POLLING;
+ spin_unlock(&vc->lock);
+
+ do {
+ if (kvmppc_vcore_check_block(vc)) {
+ do_sleep = 0;
+ break;
+ }
+ cur = ktime_get();
+ } while (single_task_running() && ktime_before(cur, stop));
+
+ spin_lock(&vc->lock);
+ vc->vcore_state = VCORE_INACTIVE;
+
+ if (!do_sleep) {
+ ++vc->runner->stat.halt_successful_poll;
+ goto out;
}
}
- if (!do_sleep) {
+ prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+
+ if (kvmppc_vcore_check_block(vc)) {
finish_swait(&vc->wq, &wait);
- return;
+ do_sleep = 0;
+ /* If we polled, count this as a successful poll */
+ if (vc->halt_poll_ns)
+ ++vc->runner->stat.halt_successful_poll;
+ goto out;
}
+ start_wait = ktime_get();
+
vc->vcore_state = VCORE_SLEEPING;
trace_kvmppc_vcore_blocked(vc, 0);
spin_unlock(&vc->lock);
@@ -2640,13 +2630,52 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE;
trace_kvmppc_vcore_blocked(vc, 1);
+ ++vc->runner->stat.halt_successful_wait;
+
+ cur = ktime_get();
+
+out:
+ block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
+
+ /* Attribute wait time */
+ if (do_sleep) {
+ vc->runner->stat.halt_wait_ns +=
+ ktime_to_ns(cur) - ktime_to_ns(start_wait);
+ /* Attribute failed poll time */
+ if (vc->halt_poll_ns)
+ vc->runner->stat.halt_poll_fail_ns +=
+ ktime_to_ns(start_wait) -
+ ktime_to_ns(start_poll);
+ } else {
+ /* Attribute successful poll time */
+ if (vc->halt_poll_ns)
+ vc->runner->stat.halt_poll_success_ns +=
+ ktime_to_ns(cur) -
+ ktime_to_ns(start_poll);
+ }
+
+ /* Adjust poll time */
+ if (halt_poll_max_ns) {
+ if (block_ns <= vc->halt_poll_ns)
+ ;
+ /* We slept and blocked for longer than the max halt time */
+ else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+ shrink_halt_poll_ns(vc);
+ /* We slept and our poll time is too small */
+ else if (vc->halt_poll_ns < halt_poll_max_ns &&
+ block_ns < halt_poll_max_ns)
+ grow_halt_poll_ns(vc);
+ } else
+ vc->halt_poll_ns = 0;
+
+ trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
- int n_ceded;
+ int n_ceded, i;
struct kvmppc_vcore *vc;
- struct kvm_vcpu *v, *vn;
+ struct kvm_vcpu *v;
trace_kvmppc_run_vcpu_enter(vcpu);
@@ -2666,7 +2695,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
vcpu->arch.busy_preempt = TB_NIL;
- list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+ WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
++vc->n_runnable;
/*
@@ -2706,8 +2735,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
continue;
}
- list_for_each_entry_safe(v, vn, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, v, vc) {
kvmppc_core_prepare_to_enter(v);
if (signal_pending(v->arch.run_task)) {
kvmppc_remove_runnable(vc, v);
@@ -2720,7 +2748,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
break;
n_ceded = 0;
- list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+ for_each_runnable_thread(i, v, vc) {
if (!v->arch.pending_exceptions)
n_ceded += v->arch.ceded;
else
@@ -2759,8 +2787,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
/* Wake up some vcpu to run the core */
- v = list_first_entry(&vc->runnable_threads,
- struct kvm_vcpu, arch.run_list);
+ i = -1;
+ v = next_runnable_thread(vc, &i);
wake_up(&v->arch.cpu_run);
}
@@ -2818,7 +2846,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
r = kvmppc_book3s_hv_page_fault(run, vcpu,
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
- }
+ } else if (r == RESUME_PASSTHROUGH)
+ r = kvmppc_xics_rm_complete(vcpu, 0);
} while (is_kvmppc_resume_guest(r));
out:
@@ -3247,6 +3276,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
kvmppc_free_vcores(kvm);
kvmppc_free_hpt(kvm);
+
+ kvmppc_free_pimap(kvm);
}
/* We don't need to emulate any privileged instructions or dcbz */
@@ -3282,6 +3313,184 @@ static int kvmppc_core_check_processor_compat_hv(void)
return 0;
}
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+ kfree(kvm->arch.pimap);
+}
+
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+ return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+ struct irq_desc *desc;
+ struct kvmppc_irq_map *irq_map;
+ struct kvmppc_passthru_irqmap *pimap;
+ struct irq_chip *chip;
+ int i;
+
+ if (!kvm_irq_bypass)
+ return 1;
+
+ desc = irq_to_desc(host_irq);
+ if (!desc)
+ return -EIO;
+
+ mutex_lock(&kvm->lock);
+
+ pimap = kvm->arch.pimap;
+ if (pimap == NULL) {
+ /* First call, allocate structure to hold IRQ map */
+ pimap = kvmppc_alloc_pimap();
+ if (pimap == NULL) {
+ mutex_unlock(&kvm->lock);
+ return -ENOMEM;
+ }
+ kvm->arch.pimap = pimap;
+ }
+
+ /*
+ * For now, we only support interrupts for which the EOI operation
+ * is an OPAL call followed by a write to XIRR, since that's
+ * what our real-mode EOI code does.
+ */
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (!chip || !is_pnv_opal_msi(chip)) {
+ pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
+ host_irq, guest_gsi);
+ mutex_unlock(&kvm->lock);
+ return -ENOENT;
+ }
+
+ /*
+ * See if we already have an entry for this guest IRQ number.
+ * If it's mapped to a hardware IRQ number, that's an error,
+ * otherwise re-use this entry.
+ */
+ for (i = 0; i < pimap->n_mapped; i++) {
+ if (guest_gsi == pimap->mapped[i].v_hwirq) {
+ if (pimap->mapped[i].r_hwirq) {
+ mutex_unlock(&kvm->lock);
+ return -EINVAL;
+ }
+ break;
+ }
+ }
+
+ if (i == KVMPPC_PIRQ_MAPPED) {
+ mutex_unlock(&kvm->lock);
+ return -EAGAIN; /* table is full */
+ }
+
+ irq_map = &pimap->mapped[i];
+
+ irq_map->v_hwirq = guest_gsi;
+ irq_map->desc = desc;
+
+ /*
+ * Order the above two stores before the next to serialize with
+ * the KVM real mode handler.
+ */
+ smp_wmb();
+ irq_map->r_hwirq = desc->irq_data.hwirq;
+
+ if (i == pimap->n_mapped)
+ pimap->n_mapped++;
+
+ kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+
+ mutex_unlock(&kvm->lock);
+
+ return 0;
+}
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+ struct irq_desc *desc;
+ struct kvmppc_passthru_irqmap *pimap;
+ int i;
+
+ if (!kvm_irq_bypass)
+ return 0;
+
+ desc = irq_to_desc(host_irq);
+ if (!desc)
+ return -EIO;
+
+ mutex_lock(&kvm->lock);
+
+ if (kvm->arch.pimap == NULL) {
+ mutex_unlock(&kvm->lock);
+ return 0;
+ }
+ pimap = kvm->arch.pimap;
+
+ for (i = 0; i < pimap->n_mapped; i++) {
+ if (guest_gsi == pimap->mapped[i].v_hwirq)
+ break;
+ }
+
+ if (i == pimap->n_mapped) {
+ mutex_unlock(&kvm->lock);
+ return -ENODEV;
+ }
+
+ kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
+ /* invalidate the entry */
+ pimap->mapped[i].r_hwirq = 0;
+
+ /*
+ * We don't free this structure even when the count goes to
+ * zero. The structure is freed when we destroy the VM.
+ */
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret = 0;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ irqfd->producer = prod;
+
+ ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+ if (ret)
+ pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
+ prod->irq, irqfd->gsi, ret);
+
+ return ret;
+}
+
+static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+ * default external interrupt handling mode - KVM real mode
+ * will switch back to host.
+ */
+ ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+ if (ret)
+ pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
+ prod->irq, irqfd->gsi, ret);
+}
+#endif
+
static long kvm_arch_vm_ioctl_hv(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3400,6 +3609,10 @@ static struct kvmppc_ops kvm_ops_hv = {
.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
.arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
.hcall_implemented = kvmppc_hcall_impl_hv,
+#ifdef CONFIG_KVM_XICS
+ .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
+ .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
+#endif
};
static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5f0380db3eab..0c84d6bc8356 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,7 @@
#include <asm/xics.h>
#include <asm/dbell.h>
#include <asm/cputhreads.h>
+#include <asm/io.h>
#define KVM_CMA_CHUNK_ORDER 18
@@ -286,3 +287,158 @@ void kvmhv_commence_exit(int trap)
struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+#ifdef CONFIG_KVM_XICS
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+ u32 xisr)
+{
+ int i;
+
+ /*
+ * We access the mapped array here without a lock. That
+ * is safe because we never reduce the number of entries
+ * in the array and we never change the v_hwirq field of
+ * an entry once it is set.
+ *
+ * We have also carefully ordered the stores in the writer
+ * and the loads here in the reader, so that if we find a matching
+ * hwirq here, the associated GSI and irq_desc fields are valid.
+ */
+ for (i = 0; i < pimap->n_mapped; i++) {
+ if (xisr == pimap->mapped[i].r_hwirq) {
+ /*
+ * Order subsequent reads in the caller to serialize
+ * with the writer.
+ */
+ smp_rmb();
+ return &pimap->mapped[i];
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If we have an interrupt that's not an IPI, check if we have a
+ * passthrough adapter and if so, check if this external interrupt
+ * is for the adapter.
+ * We will attempt to deliver the IRQ directly to the target VCPU's
+ * ICP, the virtual ICP (based on affinity - the xive value in ICS).
+ *
+ * If the delivery fails or if this is not for a passthrough adapter,
+ * return to the host to handle this interrupt. We earlier
+ * saved a copy of the XIRR in the PACA, it will be picked up by
+ * the host ICP driver.
+ */
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+ struct kvmppc_passthru_irqmap *pimap;
+ struct kvmppc_irq_map *irq_map;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = local_paca->kvm_hstate.kvm_vcpu;
+ if (!vcpu)
+ return 1;
+ pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
+ if (!pimap)
+ return 1;
+ irq_map = get_irqmap(pimap, xisr);
+ if (!irq_map)
+ return 1;
+
+ /* We're handling this interrupt, generic code doesn't need to */
+ local_paca->kvm_hstate.saved_xirr = 0;
+
+ return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
+}
+
+#else
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+ return 1;
+}
+#endif
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ * 0 if no interrupt is pending
+ * 1 if an interrupt is pending that needs to be handled by the host
+ * 2 Passthrough that needs completion in the host
+ * -1 if there was a guest wakeup IPI (which has now been cleared)
+ * -2 if there is PCI passthrough external interrupt that was handled
+ */
+
+long kvmppc_read_intr(void)
+{
+ unsigned long xics_phys;
+ u32 h_xirr;
+ __be32 xirr;
+ u32 xisr;
+ u8 host_ipi;
+
+ /* see if a host IPI is pending */
+ host_ipi = local_paca->kvm_hstate.host_ipi;
+ if (host_ipi)
+ return 1;
+
+ /* Now read the interrupt from the ICP */
+ xics_phys = local_paca->kvm_hstate.xics_phys;
+ if (unlikely(!xics_phys))
+ return 1;
+
+ /*
+ * Save XIRR for later. Since we get control in reverse endian
+ * on LE systems, save it byte reversed and fetch it back in
+ * host endian. Note that xirr is the value read from the
+ * XIRR register, while h_xirr is the host endian version.
+ */
+ xirr = _lwzcix(xics_phys + XICS_XIRR);
+ h_xirr = be32_to_cpu(xirr);
+ local_paca->kvm_hstate.saved_xirr = h_xirr;
+ xisr = h_xirr & 0xffffff;
+ /*
+ * Ensure that the store/load complete to guarantee all side
+ * effects of loading from XIRR has completed
+ */
+ smp_mb();
+
+ /* if nothing pending in the ICP */
+ if (!xisr)
+ return 0;
+
+ /* We found something in the ICP...
+ *
+ * If it is an IPI, clear the MFRR and EOI it.
+ */
+ if (xisr == XICS_IPI) {
+ _stbcix(xics_phys + XICS_MFRR, 0xff);
+ _stwcix(xics_phys + XICS_XIRR, xirr);
+ /*
+ * Need to ensure side effects of above stores
+ * complete before proceeding.
+ */
+ smp_mb();
+
+ /*
+ * We need to re-check host IPI now in case it got set in the
+ * meantime. If it's clear, we bounce the interrupt to the
+ * guest
+ */
+ host_ipi = local_paca->kvm_hstate.host_ipi;
+ if (unlikely(host_ipi != 0)) {
+ /* We raced with the host,
+ * we need to resend that IPI, bummer
+ */
+ _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+ /* Let side effects complete */
+ smp_mb();
+ return 1;
+ }
+
+ /* OK, it's an IPI for us */
+ local_paca->kvm_hstate.saved_xirr = 0;
+ return -1;
+ }
+
+ return kvmppc_check_passthru(xisr, xirr);
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 980d8a6f7284..82ff5de8b1e7 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
+#include <linux/kernel_stat.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
@@ -18,7 +19,10 @@
#include <asm/debug.h>
#include <asm/synch.h>
#include <asm/cputhreads.h>
+#include <asm/pgtable.h>
#include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
#include "book3s_xics.h"
@@ -26,9 +30,12 @@
int h_ipi_redirect = 1;
EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
u32 new_irq);
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
/* -- ICS routines -- */
static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -708,10 +715,123 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
icp->rm_action |= XICS_RM_NOTIFY_EOI;
icp->rm_eoied_irq = irq;
}
+
+ if (state->host_irq) {
+ ++vcpu->stat.pthru_all;
+ if (state->intr_cpu != -1) {
+ int pcpu = raw_smp_processor_id();
+
+ pcpu = cpu_first_thread_sibling(pcpu);
+ ++vcpu->stat.pthru_host;
+ if (state->intr_cpu != pcpu) {
+ ++vcpu->stat.pthru_bad_aff;
+ xics_opal_rm_set_server(state->host_irq, pcpu);
+ }
+ state->intr_cpu = -1;
+ }
+ }
bail:
return check_too_hard(xics, icp);
}
+unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+{
+ unsigned long xics_phys;
+ int64_t rc;
+
+ rc = pnv_opal_pci_msi_eoi(c, hwirq);
+
+ if (rc)
+ eoi_rc = rc;
+
+ iosync();
+
+ /* EOI it */
+ xics_phys = local_paca->kvm_hstate.xics_phys;
+ _stwcix(xics_phys + XICS_XIRR, xirr);
+}
+
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+{
+ unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
+
+ return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+}
+
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real-mode. Handles vmalloc'ed addresses
+ *
+ * ToDo: Make this work for any integral type
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+ unsigned long l;
+ unsigned int *raddr;
+ int cpu = smp_processor_id();
+
+ raddr = per_cpu_ptr(addr, cpu);
+ l = (unsigned long)raddr;
+
+ if (REGION_ID(l) == VMALLOC_REGION_ID) {
+ l = vmalloc_to_phys(raddr);
+ raddr = (unsigned int *)l;
+ }
+ ++*raddr;
+}
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ * - state flags represent internal IRQ state and are not expected to be
+ * updated outside the IRQ subsystem
+ * - more importantly, these are useful for edge triggered interrupts,
+ * IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ * and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() for this since this function is defined
+ * in irq/internal.h which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per CPU
+ * variable and could have been vmalloc'ed, so we can't directly
+ * call __this_cpu_inc() on it. The kstat structure is a static
+ * per CPU variable and it should be accessible by real-mode KVM.
+ *
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+ this_cpu_inc_rm(desc->kstat_irqs);
+ __this_cpu_inc(kstat.irqs_sum);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+ u32 xirr,
+ struct kvmppc_irq_map *irq_map,
+ struct kvmppc_passthru_irqmap *pimap)
+{
+ struct kvmppc_xics *xics;
+ struct kvmppc_icp *icp;
+ u32 irq;
+
+ irq = irq_map->v_hwirq;
+ xics = vcpu->kvm->arch.xics;
+ icp = vcpu->arch.icp;
+
+ kvmppc_rm_handle_irq_desc(irq_map->desc);
+ icp_rm_deliver_irq(xics, icp, irq);
+
+ /* EOI the interrupt */
+ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
+
+ if (check_too_hard(xics, icp) == H_TOO_HARD)
+ return 2;
+ else
+ return -2;
+}
+
/* --- Non-real mode XICS-related built-in routines --- */
/**
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 975655573844..c3c1d1bcfc67 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -221,6 +221,13 @@ kvmppc_primary_no_guest:
li r3, 0 /* Don't wake on privileged (OS) doorbell */
b kvm_do_nap
+/*
+ * kvm_novcpu_wakeup
+ * Entered from kvm_start_guest if kvm_hstate.napping is set
+ * to NAPPING_NOVCPU
+ * r2 = kernel TOC
+ * r13 = paca
+ */
kvm_novcpu_wakeup:
ld r1, HSTATE_HOST_R1(r13)
ld r5, HSTATE_KVM_VCORE(r13)
@@ -230,6 +237,13 @@ kvm_novcpu_wakeup:
/* check the wake reason */
bl kvmppc_check_wake_reason
+ /*
+ * Restore volatile registers since we could have called
+ * a C routine in kvmppc_check_wake_reason.
+ * r5 = VCORE
+ */
+ ld r5, HSTATE_KVM_VCORE(r13)
+
/* see if any other thread is already exiting */
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
@@ -322,6 +336,11 @@ kvm_start_guest:
/* Check the wake reason in SRR1 to see why we got here */
bl kvmppc_check_wake_reason
+ /*
+ * kvmppc_check_wake_reason could invoke a C routine, but we
+ * have no volatile registers to restore when we return.
+ */
+
cmpdi r3, 0
bge kvm_no_guest
@@ -625,9 +644,11 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
38:
BEGIN_FTR_SECTION
- /* DPDES is shared between threads */
+ /* DPDES and VTB are shared between threads */
ld r8, VCORE_DPDES(r5)
+ ld r7, VCORE_VTB(r5)
mtspr SPRN_DPDES, r8
+ mtspr SPRN_VTB, r7
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Mark the subcore state as inside guest */
@@ -787,10 +808,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtspr SPRN_CIABR, r7
mtspr SPRN_TAR, r8
ld r5, VCPU_IC(r4)
- ld r6, VCPU_VTB(r4)
- mtspr SPRN_IC, r5
- mtspr SPRN_VTB, r6
ld r8, VCPU_EBBHR(r4)
+ mtspr SPRN_IC, r5
mtspr SPRN_EBBHR, r8
ld r5, VCPU_EBBRR(r4)
ld r6, VCPU_BESCR(r4)
@@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
cmpwi r3, 512 /* 1 microsecond */
blt hdec_soon
+deliver_guest_interrupt:
ld r6, VCPU_CTR(r4)
ld r7, VCPU_XER(r4)
@@ -895,7 +915,6 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
mtspr SPRN_SRR0, r6
mtspr SPRN_SRR1, r7
-deliver_guest_interrupt:
/* r11 = vcpu->arch.msr & ~MSR_HV */
rldicl r11, r11, 63 - MSR_HV_LG, 1
rotldi r11, r11, 1 + MSR_HV_LG
@@ -1155,10 +1174,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
* set, we know the host wants us out so let's do it now
*/
bl kvmppc_read_intr
+
+ /*
+ * Restore the active volatile registers after returning from
+ * a C function.
+ */
+ ld r9, HSTATE_KVM_VCPU(r13)
+ li r12, BOOK3S_INTERRUPT_EXTERNAL
+
+ /*
+ * kvmppc_read_intr return codes:
+ *
+ * Exit to host (r3 > 0)
+ * 1 An interrupt is pending that needs to be handled by the host
+ * Exit guest and return to host by branching to guest_exit_cont
+ *
+ * 2 Passthrough that needs completion in the host
+ * Exit guest and return to host by branching to guest_exit_cont
+ * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+ * to indicate to the host to complete handling the interrupt
+ *
+ * Before returning to guest, we check if any CPU is heading out
+ * to the host and if so, we head out also. If no CPUs are heading
+ * check return values <= 0.
+ *
+ * Return to guest (r3 <= 0)
+ * 0 No external interrupt is pending
+ * -1 A guest wakeup IPI (which has now been cleared)
+ * In either case, we return to guest to deliver any pending
+ * guest interrupts.
+ *
+ * -2 A PCI passthrough external interrupt was handled
+ * (interrupt was delivered directly to guest)
+ * Return to guest to deliver any pending guest interrupts.
+ */
+
+ cmpdi r3, 1
+ ble 1f
+
+ /* Return code = 2 */
+ li r12, BOOK3S_INTERRUPT_HV_RM_HARD
+ stw r12, VCPU_TRAP(r9)
+ b guest_exit_cont
+
+1: /* Return code <= 1 */
cmpdi r3, 0
bgt guest_exit_cont
- /* Check if any CPU is heading out to the host, if so head out too */
+ /* Return code <= 0 */
4: ld r5, HSTATE_KVM_VCORE(r13)
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
@@ -1271,10 +1334,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
stw r6, VCPU_PSPB(r9)
std r7, VCPU_FSCR(r9)
mfspr r5, SPRN_IC
- mfspr r6, SPRN_VTB
mfspr r7, SPRN_TAR
std r5, VCPU_IC(r9)
- std r6, VCPU_VTB(r9)
std r7, VCPU_TAR(r9)
mfspr r8, SPRN_EBBHR
std r8, VCPU_EBBHR(r9)
@@ -1501,9 +1562,11 @@ kvmhv_switch_to_host:
isync
BEGIN_FTR_SECTION
- /* DPDES is shared between threads */
+ /* DPDES and VTB are shared between threads */
mfspr r7, SPRN_DPDES
+ mfspr r8, SPRN_VTB
std r7, VCORE_DPDES(r5)
+ std r8, VCORE_VTB(r5)
/* clear DPDES so we don't get guest doorbells in the host */
li r8, 0
mtspr SPRN_DPDES, r8
@@ -2213,10 +2276,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
ld r29, VCPU_GPR(R29)(r4)
ld r30, VCPU_GPR(R30)(r4)
ld r31, VCPU_GPR(R31)(r4)
-
+
/* Check the wake reason in SRR1 to see why we got here */
bl kvmppc_check_wake_reason
+ /*
+ * Restore volatile registers since we could have called a
+ * C routine in kvmppc_check_wake_reason
+ * r4 = VCPU
+ * r3 tells us whether we need to return to host or not
+ * WARNING: it gets checked further down:
+ * should not modify r3 until this check is done.
+ */
+ ld r4, HSTATE_KVM_VCPU(r13)
+
/* clear our bit in vcore->napping_threads */
34: ld r5,HSTATE_KVM_VCORE(r13)
lbz r7,HSTATE_PTID(r13)
@@ -2230,7 +2303,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
li r0,0
stb r0,HSTATE_NAPPING(r13)
- /* See if the wake reason means we need to exit */
+ /* See if the wake reason saved in r3 means we need to exit */
stw r12, VCPU_TRAP(r4)
mr r9, r4
cmpdi r3, 0
@@ -2297,10 +2370,14 @@ machine_check_realmode:
* 0 if nothing needs to be done
* 1 if something happened that needs to be handled by the host
* -1 if there was a guest wakeup (IPI or msgsnd)
+ * -2 if we handled a PCI passthrough interrupt (returned by
+ * kvmppc_read_intr only)
*
* Also sets r12 to the interrupt vector for any interrupt that needs
* to be handled now by the host (0x500 for external interrupt), or zero.
- * Modifies r0, r6, r7, r8.
+ * Modifies all volatile registers (since it may call a C function).
+ * This routine calls kvmppc_read_intr, a C function, if an external
+ * interrupt is pending.
*/
kvmppc_check_wake_reason:
mfspr r6, SPRN_SRR1
@@ -2310,8 +2387,7 @@ FTR_SECTION_ELSE
rlwinm r6, r6, 45-31, 0xe /* P7 wake reason field is 3 bits */
ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
cmpwi r6, 8 /* was it an external interrupt? */
- li r12, BOOK3S_INTERRUPT_EXTERNAL
- beq kvmppc_read_intr /* if so, see what it was */
+ beq 7f /* if so, see what it was */
li r3, 0
li r12, 0
cmpwi r6, 6 /* was it the decrementer? */
@@ -2350,83 +2426,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
blr
-/*
- * Determine what sort of external interrupt is pending (if any).
- * Returns:
- * 0 if no interrupt is pending
- * 1 if an interrupt is pending that needs to be handled by the host
- * -1 if there was a guest wakeup IPI (which has now been cleared)
- * Modifies r0, r6, r7, r8, returns value in r3.
- */
-kvmppc_read_intr:
- /* see if a host IPI is pending */
- li r3, 1
- lbz r0, HSTATE_HOST_IPI(r13)
- cmpwi r0, 0
- bne 1f
+ /* external interrupt - create a stack frame so we can call C */
+7: mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+ stdu r1, -PPC_MIN_STKFRM(r1)
+ bl kvmppc_read_intr
+ nop
+ li r12, BOOK3S_INTERRUPT_EXTERNAL
+ cmpdi r3, 1
+ ble 1f
- /* Now read the interrupt from the ICP */
- ld r6, HSTATE_XICS_PHYS(r13)
- li r7, XICS_XIRR
- cmpdi r6, 0
- beq- 1f
- lwzcix r0, r6, r7
/*
- * Save XIRR for later. Since we get in in reverse endian on LE
- * systems, save it byte reversed and fetch it back in host endian.
- */
- li r3, HSTATE_SAVED_XIRR
- STWX_BE r0, r3, r13
-#ifdef __LITTLE_ENDIAN__
- lwz r3, HSTATE_SAVED_XIRR(r13)
-#else
- mr r3, r0
-#endif
- rlwinm. r3, r3, 0, 0xffffff
- sync
- beq 1f /* if nothing pending in the ICP */
-
- /* We found something in the ICP...
- *
- * If it's not an IPI, stash it in the PACA and return to
- * the host, we don't (yet) handle directing real external
- * interrupts directly to the guest
+ * Return code of 2 means PCI passthrough interrupt, but
+ * we need to return back to host to complete handling the
+ * interrupt. Trap reason is expected in r12 by guest
+ * exit code.
*/
- cmpwi r3, XICS_IPI /* if there is, is it an IPI? */
- bne 42f
-
- /* It's an IPI, clear the MFRR and EOI it */
- li r3, 0xff
- li r8, XICS_MFRR
- stbcix r3, r6, r8 /* clear the IPI */
- stwcix r0, r6, r7 /* EOI it */
- sync
-
- /* We need to re-check host IPI now in case it got set in the
- * meantime. If it's clear, we bounce the interrupt to the
- * guest
- */
- lbz r0, HSTATE_HOST_IPI(r13)
- cmpwi r0, 0
- bne- 43f
-
- /* OK, it's an IPI for us */
- li r12, 0
- li r3, -1
-1: blr
-
-42: /* It's not an IPI and it's for the host. We saved a copy of XIRR in
- * the PACA earlier, it will be picked up by the host ICP driver
- */
- li r3, 1
- b 1b
-
-43: /* We raced with the host, we need to resend that IPI, bummer */
- li r0, IPI_PRIORITY
- stbcix r0, r6, r8 /* set the IPI */
- sync
- li r3, 1
- b 1b
+ li r12, BOOK3S_INTERRUPT_HV_RM_HARD
+1:
+ ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+ addi r1, r1, PPC_MIN_STKFRM
+ mtlr r0
+ blr
/*
* Save away FP, VMX and VSX registers.
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index e76f79a45988..826c541a12af 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -226,7 +226,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
*/
vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
- vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
+ to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
svcpu->in_use = false;
@@ -448,6 +448,8 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
case PVR_POWER7:
case PVR_POWER7p:
case PVR_POWER8:
+ case PVR_POWER8E:
+ case PVR_POWER8NVL:
vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
BOOK3S_HFLAG_NEW_TLBIE;
break;
@@ -1361,6 +1363,9 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_HIOR:
*val = get_reg_val(id, to_book3s(vcpu)->hior);
break;
+ case KVM_REG_PPC_VTB:
+ *val = get_reg_val(id, to_book3s(vcpu)->vtb);
+ break;
case KVM_REG_PPC_LPCR:
case KVM_REG_PPC_LPCR_64:
/*
@@ -1397,6 +1402,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
to_book3s(vcpu)->hior = set_reg_val(id, *val);
to_book3s(vcpu)->hior_explicit = true;
break;
+ case KVM_REG_PPC_VTB:
+ to_book3s(vcpu)->vtb = set_reg_val(id, *val);
+ break;
case KVM_REG_PPC_LPCR:
case KVM_REG_PPC_LPCR_64:
kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 05aa11399a78..3bdc639157c1 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
return 0;
}
+ /* Record which CPU this arrived on for passed-through interrupts */
+ if (state->host_irq)
+ state->intr_cpu = raw_smp_processor_id();
+
/* Attempt delivery */
icp_deliver_irq(xics, NULL, irq);
@@ -812,7 +816,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
return H_SUCCESS;
}
-static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -841,6 +845,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
return H_SUCCESS;
}
+EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete);
int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
{
@@ -892,6 +897,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
/* -- Initialisation code etc. -- */
+static void xics_debugfs_irqmap(struct seq_file *m,
+ struct kvmppc_passthru_irqmap *pimap)
+{
+ int i;
+
+ if (!pimap)
+ return;
+ seq_printf(m, "========\nPIRQ mappings: %d maps\n===========\n",
+ pimap->n_mapped);
+ for (i = 0; i < pimap->n_mapped; i++) {
+ seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+ pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+ }
+}
+
static int xics_debug_show(struct seq_file *m, void *private)
{
struct kvmppc_xics *xics = m->private;
@@ -913,6 +933,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_check_resend = 0;
t_reject = 0;
+ xics_debugfs_irqmap(m, kvm->arch.pimap);
+
seq_printf(m, "=========\nICP state\n=========\n");
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1252,6 +1274,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
{
struct kvmppc_xics *xics = kvm->arch.xics;
+ if (!xics)
+ return -ENODEV;
return ics_deliver_irq(xics, irq, level);
}
@@ -1418,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
return pin;
}
+
+void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
+ unsigned long host_irq)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_ics *ics;
+ u16 idx;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &idx);
+ if (!ics)
+ return;
+
+ ics->irq_state[idx].host_irq = host_irq;
+ ics->irq_state[idx].intr_cpu = -1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped);
+
+void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq,
+ unsigned long host_irq)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_ics *ics;
+ u16 idx;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &idx);
+ if (!ics)
+ return;
+
+ ics->irq_state[idx].host_irq = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped);
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index a46b954055c4..2a50320b55ca 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -42,6 +42,8 @@ struct ics_irq_state {
u8 lsi; /* level-sensitive interrupt */
u8 asserted; /* Only for LSI */
u8 exists;
+ int intr_cpu;
+ u32 host_irq;
};
/* Atomic ICP state, updated with a single compare & swap */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 02b4672f7347..df3f2706d3e5 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -2038,7 +2038,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
if (type == KVMPPC_DEBUG_NONE)
continue;
- if (type & !(KVMPPC_DEBUG_WATCH_READ |
+ if (type & ~(KVMPPC_DEBUG_WATCH_READ |
KVMPPC_DEBUG_WATCH_WRITE |
KVMPPC_DEBUG_BREAKPOINT))
return -EINVAL;
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 29911a07bcdb..ddbf8f0284c0 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
char *virt;
struct page **pages;
struct tlbe_priv *privs[2] = {};
- u64 *g2h_bitmap = NULL;
+ u64 *g2h_bitmap;
size_t array_len;
u32 sets;
int num_pages, ret, i;
@@ -779,41 +779,44 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
cfg->array / PAGE_SIZE;
- pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+ pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
if (ret < 0)
- goto err_pages;
+ goto free_pages;
if (ret != num_pages) {
num_pages = ret;
ret = -EFAULT;
- goto err_put_page;
+ goto put_pages;
}
virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
if (!virt) {
ret = -ENOMEM;
- goto err_put_page;
+ goto put_pages;
}
- privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
- GFP_KERNEL);
- privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
- GFP_KERNEL);
+ privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL);
+ if (!privs[0]) {
+ ret = -ENOMEM;
+ goto put_pages;
+ }
- if (!privs[0] || !privs[1]) {
+ privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL);
+ if (!privs[1]) {
ret = -ENOMEM;
- goto err_privs;
+ goto free_privs_first;
}
- g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
- GFP_KERNEL);
+ g2h_bitmap = kcalloc(params.tlb_sizes[1],
+ sizeof(*g2h_bitmap),
+ GFP_KERNEL);
if (!g2h_bitmap) {
ret = -ENOMEM;
- goto err_privs;
+ goto free_privs_second;
}
free_gtlb(vcpu_e500);
@@ -845,16 +848,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
kvmppc_recalc_tlb1map_range(vcpu_e500);
return 0;
-
-err_privs:
- kfree(privs[0]);
+ free_privs_second:
kfree(privs[1]);
-
-err_put_page:
+ free_privs_first:
+ kfree(privs[0]);
+ put_pages:
for (i = 0; i < num_pages; i++)
put_page(pages[i]);
-
-err_pages:
+ free_pages:
kfree(pages);
return ret;
}
@@ -904,11 +905,9 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
{
struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
- int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
- int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
if (e500_mmu_host_init(vcpu_e500))
- goto err;
+ goto free_vcpu;
vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -920,37 +919,39 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
vcpu_e500->gtlb_params[1].sets = 1;
- vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+ vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE +
+ KVM_E500_TLB1_SIZE,
+ sizeof(*vcpu_e500->gtlb_arch),
+ GFP_KERNEL);
if (!vcpu_e500->gtlb_arch)
return -ENOMEM;
vcpu_e500->gtlb_offset[0] = 0;
vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
- vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
- vcpu_e500->gtlb_params[0].entries,
+ vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries,
+ sizeof(struct tlbe_ref),
GFP_KERNEL);
if (!vcpu_e500->gtlb_priv[0])
- goto err;
+ goto free_vcpu;
- vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
- vcpu_e500->gtlb_params[1].entries,
+ vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
+ sizeof(struct tlbe_ref),
GFP_KERNEL);
if (!vcpu_e500->gtlb_priv[1])
- goto err;
+ goto free_vcpu;
- vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
- vcpu_e500->gtlb_params[1].entries,
+ vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
+ sizeof(*vcpu_e500->g2h_tlb1_map),
GFP_KERNEL);
if (!vcpu_e500->g2h_tlb1_map)
- goto err;
+ goto free_vcpu;
vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
kvmppc_recalc_tlb1map_range(vcpu_e500);
return 0;
-
-err:
+ free_vcpu:
free_gtlb(vcpu_e500);
return -1;
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6ce40dd6fe51..70963c845e96 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,8 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/module.h>
+#include <linux/irqbypass.h>
+#include <linux/kvm_irqfd.h>
#include <asm/cputable.h>
#include <asm/uaccess.h>
#include <asm/kvm_ppc.h>
@@ -436,6 +438,16 @@ err_out:
return -EINVAL;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
unsigned int i;
@@ -739,6 +751,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
#endif
}
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+ return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+ (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+
+ if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+ return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+ return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+
+ if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+ kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
+
static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
@@ -1167,6 +1215,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return r;
}
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_MPIC
+ if (kvm->arch.mpic)
+ return true;
+#endif
+#ifdef CONFIG_KVM_XICS
+ if (kvm->arch.xics)
+ return true;
+#endif
+ return false;
+}
+
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index 33d9daff5783..fb21990c0fb4 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
__entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
);
+TRACE_EVENT(kvmppc_vcore_wakeup,
+ TP_PROTO(int do_sleep, __u64 ns),
+
+ TP_ARGS(do_sleep, ns),
+
+ TP_STRUCT__entry(
+ __field(__u64, ns)
+ __field(int, waited)
+ __field(pid_t, tgid)
+ ),
+
+ TP_fast_assign(
+ __entry->ns = ns;
+ __entry->waited = do_sleep;
+ __entry->tgid = current->tgid;
+ ),
+
+ TP_printk("%s time %lld ns, tgid=%d",
+ __entry->waited ? "wait" : "poll",
+ __entry->ns, __entry->tgid)
+);
+
TRACE_EVENT(kvmppc_run_vcpu_enter,
TP_PROTO(struct kvm_vcpu *vcpu),
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 0e4e9654bd2c..83ddc0e171b0 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid,
}
#endif
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
- int i, shift;
- unsigned int mask;
-
- /* start from 1 ignoring MMU_PAGE_4K */
- for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
- /* invalid penc */
- if (mmu_psize_defs[psize].penc[i] == -1)
- continue;
- /*
- * encoding bits per actual page size
- * PTE LP actual page size
- * rrrr rrrz >=8KB
- * rrrr rrzz >=16KB
- * rrrr rzzz >=32KB
- * rrrr zzzz >=64KB
- * .......
- */
- shift = mmu_psize_defs[i].shift - LP_SHIFT;
- if (shift > LP_BITS)
- shift = LP_BITS;
- mask = (1 << shift) - 1;
- if ((lp & mask) == mmu_psize_defs[psize].penc[i])
- return i;
- }
- return -1;
-}
-
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
@@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
size = MMU_PAGE_4K;
a_size = MMU_PAGE_4K;
} else {
- for (size = 0; size < MMU_PAGE_COUNT; size++) {
-
- /* valid entries have a shift value */
- if (!mmu_psize_defs[size].shift)
- continue;
-
- a_size = __hpte_actual_psize(lp, size);
- if (a_size != -1)
- break;
- }
+ size = hpte_page_sizes[lp] & 0xf;
+ a_size = hpte_page_sizes[lp] >> 4;
}
/* This works for all page sizes, and for 256M and 1T segments */
if (cpu_has_feature(CPU_FTR_ARCH_300))
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556e16f4..ef3ae891a3db 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -93,6 +93,9 @@ static unsigned long _SDR1;
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
EXPORT_SYMBOL_GPL(mmu_psize_defs);
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
struct hash_pte *htab_address;
unsigned long htab_size_bytes;
unsigned long htab_hash_mask;
@@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void)
#endif /* CONFIG_HUGETLB_PAGE */
}
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations. Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE. For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ * PTE LP actual page size
+ * rrrr rrrz >=8KB
+ * rrrr rrzz >=16KB
+ * rrrr rzzz >=32KB
+ * rrrr zzzz >=64KB
+ * ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+ long int ap, bp;
+ long int shift, penc;
+
+ for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+ if (!mmu_psize_defs[bp].shift)
+ continue; /* not a supported page size */
+ for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+ penc = mmu_psize_defs[bp].penc[ap];
+ if (penc == -1)
+ continue;
+ shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+ if (shift <= 0)
+ continue; /* should never happen */
+ /*
+ * For page sizes less than 1MB, this loop
+ * replicates the entry for all possible values
+ * of the rrrr bits.
+ */
+ while (penc < (1 << LP_BITS)) {
+ hpte_page_sizes[penc] = (ap << 4) | bp;
+ penc += 1 << shift;
+ }
+ }
+ }
+}
+
static void __init htab_init_page_sizes(void)
{
+ init_hpte_page_sizes();
+
if (!debug_pagealloc_enabled()) {
/*
* Pick a size for the linear mapping. Currently, we only
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3d29d40eb0e9..44d2d842cee7 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -208,6 +208,7 @@ OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
+OPAL_CALL_REAL(opal_rm_set_xive, OPAL_SET_XIVE);
OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 38a5c657ffd3..d314eccd075b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2718,15 +2718,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
}
#ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
{
- unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
- struct irq_chip *chip = irq_data_get_irq_chip(d);
struct pnv_phb *phb = container_of(chip, struct pnv_phb,
ioda.irq_chip);
+
+ return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
int64_t rc;
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ struct irq_chip *chip = irq_data_get_irq_chip(d);
- rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+ rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
WARN_ON_ONCE(rc);
icp_native_eoi(d);
@@ -2756,6 +2762,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
irq_set_chip(virq, &phb->ioda.irq_chip);
}
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+ return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
unsigned int hwirq, unsigned int virq,
unsigned int is_64, struct msi_msg *msg)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8e5daf7a76ce..a41faf34b034 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -28,7 +28,7 @@
#define KVM_S390_BSCA_CPU_SLOTS 64
#define KVM_S390_ESCA_CPU_SLOTS 248
-#define KVM_MAX_VCPUS KVM_S390_ESCA_CPU_SLOTS
+#define KVM_MAX_VCPUS 255
#define KVM_USER_MEM_SLOTS 32
/*
@@ -245,72 +245,72 @@ struct sie_page {
} __packed;
struct kvm_vcpu_stat {
- u32 exit_userspace;
- u32 exit_null;
- u32 exit_external_request;
- u32 exit_external_interrupt;
- u32 exit_stop_request;
- u32 exit_validity;
- u32 exit_instruction;
- u32 exit_pei;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 instruction_lctl;
- u32 instruction_lctlg;
- u32 instruction_stctl;
- u32 instruction_stctg;
- u32 exit_program_interruption;
- u32 exit_instr_and_program;
- u32 exit_operation_exception;
- u32 deliver_external_call;
- u32 deliver_emergency_signal;
- u32 deliver_service_signal;
- u32 deliver_virtio_interrupt;
- u32 deliver_stop_signal;
- u32 deliver_prefix_signal;
- u32 deliver_restart_signal;
- u32 deliver_program_int;
- u32 deliver_io_int;
- u32 exit_wait_state;
- u32 instruction_pfmf;
- u32 instruction_stidp;
- u32 instruction_spx;
- u32 instruction_stpx;
- u32 instruction_stap;
- u32 instruction_storage_key;
- u32 instruction_ipte_interlock;
- u32 instruction_stsch;
- u32 instruction_chsc;
- u32 instruction_stsi;
- u32 instruction_stfl;
- u32 instruction_tprot;
- u32 instruction_sie;
- u32 instruction_essa;
- u32 instruction_sthyi;
- u32 instruction_sigp_sense;
- u32 instruction_sigp_sense_running;
- u32 instruction_sigp_external_call;
- u32 instruction_sigp_emergency;
- u32 instruction_sigp_cond_emergency;
- u32 instruction_sigp_start;
- u32 instruction_sigp_stop;
- u32 instruction_sigp_stop_store_status;
- u32 instruction_sigp_store_status;
- u32 instruction_sigp_store_adtl_status;
- u32 instruction_sigp_arch;
- u32 instruction_sigp_prefix;
- u32 instruction_sigp_restart;
- u32 instruction_sigp_init_cpu_reset;
- u32 instruction_sigp_cpu_reset;
- u32 instruction_sigp_unknown;
- u32 diagnose_10;
- u32 diagnose_44;
- u32 diagnose_9c;
- u32 diagnose_258;
- u32 diagnose_308;
- u32 diagnose_500;
+ u64 exit_userspace;
+ u64 exit_null;
+ u64 exit_external_request;
+ u64 exit_external_interrupt;
+ u64 exit_stop_request;
+ u64 exit_validity;
+ u64 exit_instruction;
+ u64 exit_pei;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 instruction_lctl;
+ u64 instruction_lctlg;
+ u64 instruction_stctl;
+ u64 instruction_stctg;
+ u64 exit_program_interruption;
+ u64 exit_instr_and_program;
+ u64 exit_operation_exception;
+ u64 deliver_external_call;
+ u64 deliver_emergency_signal;
+ u64 deliver_service_signal;
+ u64 deliver_virtio_interrupt;
+ u64 deliver_stop_signal;
+ u64 deliver_prefix_signal;
+ u64 deliver_restart_signal;
+ u64 deliver_program_int;
+ u64 deliver_io_int;
+ u64 exit_wait_state;
+ u64 instruction_pfmf;
+ u64 instruction_stidp;
+ u64 instruction_spx;
+ u64 instruction_stpx;
+ u64 instruction_stap;
+ u64 instruction_storage_key;
+ u64 instruction_ipte_interlock;
+ u64 instruction_stsch;
+ u64 instruction_chsc;
+ u64 instruction_stsi;
+ u64 instruction_stfl;
+ u64 instruction_tprot;
+ u64 instruction_sie;
+ u64 instruction_essa;
+ u64 instruction_sthyi;
+ u64 instruction_sigp_sense;
+ u64 instruction_sigp_sense_running;
+ u64 instruction_sigp_external_call;
+ u64 instruction_sigp_emergency;
+ u64 instruction_sigp_cond_emergency;
+ u64 instruction_sigp_start;
+ u64 instruction_sigp_stop;
+ u64 instruction_sigp_stop_store_status;
+ u64 instruction_sigp_store_status;
+ u64 instruction_sigp_store_adtl_status;
+ u64 instruction_sigp_arch;
+ u64 instruction_sigp_prefix;
+ u64 instruction_sigp_restart;
+ u64 instruction_sigp_init_cpu_reset;
+ u64 instruction_sigp_cpu_reset;
+ u64 instruction_sigp_unknown;
+ u64 diagnose_10;
+ u64 diagnose_44;
+ u64 diagnose_9c;
+ u64 diagnose_258;
+ u64 diagnose_308;
+ u64 diagnose_500;
};
#define PGM_OPERATION 0x01
@@ -577,7 +577,7 @@ struct kvm_vcpu_arch {
};
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_arch_memory_slot {
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 1f95cc1faeb7..f3df9e0a5dec 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -125,6 +125,7 @@ int main(void)
OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list);
OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list);
OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
+ OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 54200208bf24..4aa8a7e2a1da 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -495,6 +495,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
switch (code) {
+ case PGM_PROTECTION:
+ switch (prot) {
+ case PROT_TYPE_ALC:
+ tec->b60 = 1;
+ /* FALL THROUGH */
+ case PROT_TYPE_DAT:
+ tec->b61 = 1;
+ break;
+ default: /* LA and KEYC set b61 to 0, other params undefined */
+ return code;
+ }
+ /* FALL THROUGH */
case PGM_ASCE_TYPE:
case PGM_PAGE_TRANSLATION:
case PGM_REGION_FIRST_TRANS:
@@ -504,8 +516,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
/*
* op_access_id only applies to MOVE_PAGE -> set bit 61
* exc_access_id has to be set to 0 for some instructions. Both
- * cases have to be handled by the caller. We can always store
- * exc_access_id, as it is undefined for non-ar cases.
+ * cases have to be handled by the caller.
*/
tec->addr = gva >> PAGE_SHIFT;
tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
@@ -516,25 +527,13 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
case PGM_ASTE_VALIDITY:
case PGM_ASTE_SEQUENCE:
case PGM_EXTENDED_AUTHORITY:
+ /*
+ * We can always store exc_access_id, as it is
+ * undefined for non-ar cases. It is undefined for
+ * most DAT protection exceptions.
+ */
pgm->exc_access_id = ar;
break;
- case PGM_PROTECTION:
- switch (prot) {
- case PROT_TYPE_ALC:
- tec->b60 = 1;
- /* FALL THROUGH */
- case PROT_TYPE_DAT:
- tec->b61 = 1;
- tec->addr = gva >> PAGE_SHIFT;
- tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
- /* exc_access_id is undefined for most cases */
- pgm->exc_access_id = ar;
- break;
- default: /* LA and KEYC set b61 to 0, other params undefined */
- break;
- }
- break;
}
return code;
}
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 31a05330d11c..d7c6a7f53ced 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -206,7 +206,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
- int ret = 0, nr_wp = 0, nr_bp = 0, i, size;
+ int ret = 0, nr_wp = 0, nr_bp = 0, i;
struct kvm_hw_breakpoint *bp_data = NULL;
struct kvm_hw_wp_info_arch *wp_info = NULL;
struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -216,17 +216,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT)
return -EINVAL;
- size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint);
- bp_data = kmalloc(size, GFP_KERNEL);
- if (!bp_data) {
- ret = -ENOMEM;
- goto error;
- }
-
- if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) {
- ret = -EFAULT;
- goto error;
- }
+ bp_data = memdup_user(dbg->arch.hw_bp,
+ sizeof(*bp_data) * dbg->arch.nr_hw_bp);
+ if (IS_ERR(bp_data))
+ return PTR_ERR(bp_data);
for (i = 0; i < dbg->arch.nr_hw_bp; i++) {
switch (bp_data[i].type) {
@@ -241,17 +234,19 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
}
}
- size = nr_wp * sizeof(struct kvm_hw_wp_info_arch);
- if (size > 0) {
- wp_info = kmalloc(size, GFP_KERNEL);
+ if (nr_wp > 0) {
+ wp_info = kmalloc_array(nr_wp,
+ sizeof(*wp_info),
+ GFP_KERNEL);
if (!wp_info) {
ret = -ENOMEM;
goto error;
}
}
- size = nr_bp * sizeof(struct kvm_hw_bp_info_arch);
- if (size > 0) {
- bp_info = kmalloc(size, GFP_KERNEL);
+ if (nr_bp > 0) {
+ bp_info = kmalloc_array(nr_bp,
+ sizeof(*bp_info),
+ GFP_KERNEL);
if (!bp_info) {
ret = -ENOMEM;
goto error;
@@ -382,14 +377,20 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING;
}
+#define PER_CODE_MASK (PER_EVENT_MASK >> 24)
+#define PER_CODE_BRANCH (PER_EVENT_BRANCH >> 24)
+#define PER_CODE_IFETCH (PER_EVENT_IFETCH >> 24)
+#define PER_CODE_STORE (PER_EVENT_STORE >> 24)
+#define PER_CODE_STORE_REAL (PER_EVENT_STORE_REAL >> 24)
+
#define per_bp_event(code) \
- (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH))
+ (code & (PER_CODE_IFETCH | PER_CODE_BRANCH))
#define per_write_wp_event(code) \
- (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL))
+ (code & (PER_CODE_STORE | PER_CODE_STORE_REAL))
static int debug_exit_required(struct kvm_vcpu *vcpu)
{
- u32 perc = (vcpu->arch.sie_block->perc << 24);
+ u8 perc = vcpu->arch.sie_block->perc;
struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
struct kvm_hw_wp_info_arch *wp_info = NULL;
struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -444,7 +445,7 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
const u8 ilen = kvm_s390_get_ilen(vcpu);
struct kvm_s390_pgm_info pgm_info = {
.code = PGM_PER,
- .per_code = PER_EVENT_IFETCH >> 24,
+ .per_code = PER_CODE_IFETCH,
.per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
};
@@ -458,33 +459,33 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
static void filter_guest_per_event(struct kvm_vcpu *vcpu)
{
- u32 perc = vcpu->arch.sie_block->perc << 24;
+ const u8 perc = vcpu->arch.sie_block->perc;
u64 peraddr = vcpu->arch.sie_block->peraddr;
u64 addr = vcpu->arch.sie_block->gpsw.addr;
u64 cr9 = vcpu->arch.sie_block->gcr[9];
u64 cr10 = vcpu->arch.sie_block->gcr[10];
u64 cr11 = vcpu->arch.sie_block->gcr[11];
/* filter all events, demanded by the guest */
- u32 guest_perc = perc & cr9 & PER_EVENT_MASK;
+ u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK;
if (!guest_per_enabled(vcpu))
guest_perc = 0;
/* filter "successful-branching" events */
- if (guest_perc & PER_EVENT_BRANCH &&
+ if (guest_perc & PER_CODE_BRANCH &&
cr9 & PER_CONTROL_BRANCH_ADDRESS &&
!in_addr_range(addr, cr10, cr11))
- guest_perc &= ~PER_EVENT_BRANCH;
+ guest_perc &= ~PER_CODE_BRANCH;
/* filter "instruction-fetching" events */
- if (guest_perc & PER_EVENT_IFETCH &&
+ if (guest_perc & PER_CODE_IFETCH &&
!in_addr_range(peraddr, cr10, cr11))
- guest_perc &= ~PER_EVENT_IFETCH;
+ guest_perc &= ~PER_CODE_IFETCH;
/* All other PER events will be given to the guest */
/* TODO: Check altered address/address space */
- vcpu->arch.sie_block->perc = guest_perc >> 24;
+ vcpu->arch.sie_block->perc = guest_perc;
if (!guest_perc)
vcpu->arch.sie_block->iprcc &= ~PGM_PER;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index dfd0ca2638fa..1cab8a177d0e 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -29,6 +29,7 @@ static const intercept_handler_t instruction_handlers[256] = {
[0x01] = kvm_s390_handle_01,
[0x82] = kvm_s390_handle_lpsw,
[0x83] = kvm_s390_handle_diag,
+ [0xaa] = kvm_s390_handle_aa,
[0xae] = kvm_s390_handle_sigp,
[0xb2] = kvm_s390_handle_b2,
[0xb6] = kvm_s390_handle_stctl,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 24524c0f3ef8..be4db07f70d3 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -24,6 +24,8 @@
#include <asm/sclp.h>
#include <asm/isc.h>
#include <asm/gmap.h>
+#include <asm/switch_to.h>
+#include <asm/nmi.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "trace-s390.h"
@@ -40,6 +42,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
return 0;
+ BUG_ON(!kvm_s390_use_sca_entries());
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -68,6 +71,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
{
int expect, rc;
+ BUG_ON(!kvm_s390_use_sca_entries());
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -109,6 +113,8 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
int rc, expect;
+ if (!kvm_s390_use_sca_entries())
+ return;
atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags);
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
@@ -400,12 +406,78 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu)
return rc ? -EFAULT : 0;
}
+static int __write_machine_check(struct kvm_vcpu *vcpu,
+ struct kvm_s390_mchk_info *mchk)
+{
+ unsigned long ext_sa_addr;
+ freg_t fprs[NUM_FPRS];
+ union mci mci;
+ int rc;
+
+ mci.val = mchk->mcic;
+ /* take care of lazy register loading via vcpu load/put */
+ save_fpu_regs();
+ save_access_regs(vcpu->run->s.regs.acrs);
+
+ /* Extended save area */
+ rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr,
+ sizeof(unsigned long));
+ /* Only bits 0-53 are used for address formation */
+ ext_sa_addr &= ~0x3ffUL;
+ if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) {
+ if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs,
+ 512))
+ mci.vr = 0;
+ } else {
+ mci.vr = 0;
+ }
+
+ /* General interruption information */
+ rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID);
+ rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE);
+
+ /* Register-save areas */
+ if (MACHINE_HAS_VX) {
+ convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
+ rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128);
+ } else {
+ rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA,
+ vcpu->run->s.regs.fprs, 128);
+ }
+ rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA,
+ vcpu->run->s.regs.gprs, 128);
+ rc |= put_guest_lc(vcpu, current->thread.fpu.fpc,
+ (u32 __user *) __LC_FP_CREG_SAVE_AREA);
+ rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr,
+ (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA);
+ rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu),
+ (u64 __user *) __LC_CPU_TIMER_SAVE_AREA);
+ rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8,
+ (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA);
+ rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA,
+ &vcpu->run->s.regs.acrs, 64);
+ rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA,
+ &vcpu->arch.sie_block->gcr, 128);
+
+ /* Extended interruption information */
+ rc |= put_guest_lc(vcpu, mchk->ext_damage_code,
+ (u32 __user *) __LC_EXT_DAMAGE_CODE);
+ rc |= put_guest_lc(vcpu, mchk->failing_storage_address,
+ (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout,
+ sizeof(mchk->fixed_logout));
+ return rc ? -EFAULT : 0;
+}
+
static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
{
struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_mchk_info mchk = {};
- unsigned long adtl_status_addr;
int deliver = 0;
int rc = 0;
@@ -446,29 +518,9 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
KVM_S390_MCHK,
mchk.cr14, mchk.mcic);
-
- rc = kvm_s390_vcpu_store_status(vcpu,
- KVM_S390_STORE_STATUS_PREFIXED);
- rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR,
- &adtl_status_addr,
- sizeof(unsigned long));
- rc |= kvm_s390_vcpu_store_adtl_status(vcpu,
- adtl_status_addr);
- rc |= put_guest_lc(vcpu, mchk.mcic,
- (u64 __user *) __LC_MCCK_CODE);
- rc |= put_guest_lc(vcpu, mchk.failing_storage_address,
- (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
- rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
- &mchk.fixed_logout,
- sizeof(mchk.fixed_logout));
- rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
- &vcpu->arch.sie_block->gpsw,
- sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
- &vcpu->arch.sie_block->gpsw,
- sizeof(psw_t));
+ rc = __write_machine_check(vcpu, &mchk);
}
- return rc ? -EFAULT : 0;
+ return rc;
}
static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 7e8cb6ab7e0b..9c7a1ecfe6bd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -384,7 +384,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
r = KVM_S390_BSCA_CPU_SLOTS;
- if (sclp.has_esca && sclp.has_64bscao)
+ if (!kvm_s390_use_sca_entries())
+ r = KVM_MAX_VCPUS;
+ else if (sclp.has_esca && sclp.has_64bscao)
r = KVM_S390_ESCA_CPU_SLOTS;
break;
case KVM_CAP_NR_MEMSLOTS:
@@ -1498,6 +1500,16 @@ out_err:
return rc;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
@@ -1561,6 +1573,8 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
static void sca_del_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!kvm_s390_use_sca_entries())
+ return;
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1578,6 +1592,13 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu)
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!kvm_s390_use_sca_entries()) {
+ struct bsca_block *sca = vcpu->kvm->arch.sca;
+
+ /* we still need the basic sca for the ipte control */
+ vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
+ vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+ }
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1658,6 +1679,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
{
int rc;
+ if (!kvm_s390_use_sca_entries()) {
+ if (id < KVM_MAX_VCPUS)
+ return true;
+ return false;
+ }
if (id < KVM_S390_BSCA_CPU_SLOTS)
return true;
if (!sclp.has_esca || !sclp.has_64bscao)
@@ -1946,8 +1972,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->eca |= 1;
if (sclp.has_sigpif)
vcpu->arch.sie_block->eca |= 0x10000000U;
- if (test_kvm_facility(vcpu->kvm, 64))
- vcpu->arch.sie_block->ecb3 |= 0x01;
if (test_kvm_facility(vcpu->kvm, 129)) {
vcpu->arch.sie_block->eca |= 0x00020000;
vcpu->arch.sie_block->ecd |= 0x20000000;
@@ -2704,6 +2728,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
kvm_clear_async_pf_completion_queue(vcpu);
}
+ /*
+ * If userspace sets the riccb (e.g. after migration) to a valid state,
+ * we should enable RI here instead of doing the lazy enablement.
+ */
+ if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
+ test_kvm_facility(vcpu->kvm, 64)) {
+ struct runtime_instr_cb *riccb =
+ (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
+
+ if (riccb->valid)
+ vcpu->arch.sie_block->ecb3 |= 0x01;
+ }
+
kvm_run->kvm_dirty_regs = 0;
}
@@ -2847,38 +2884,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
return kvm_s390_store_status_unloaded(vcpu, addr);
}
-/*
- * store additional status at address
- */
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
- unsigned long gpa)
-{
- /* Only bits 0-53 are used for address formation */
- if (!(gpa & ~0x3ff))
- return 0;
-
- return write_guest_abs(vcpu, gpa & ~0x3ff,
- (void *)&vcpu->run->s.regs.vrs, 512);
-}
-
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
-{
- if (!test_kvm_facility(vcpu->kvm, 129))
- return 0;
-
- /*
- * The guest VXRS are in the host VXRs due to the lazy
- * copying in vcpu load/put. We can simply call save_fpu_regs()
- * to save the current register state because we are in the
- * middle of a load/put cycle.
- *
- * Let's update our copies before we save it into the save area.
- */
- save_fpu_regs();
-
- return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
-}
-
static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index b8432862a817..3a4e97f1a9e6 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -20,6 +20,7 @@
#include <linux/kvm_host.h>
#include <asm/facility.h>
#include <asm/processor.h>
+#include <asm/sclp.h>
typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
@@ -245,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
/* implemented in priv.c */
int is_valid_psw(psw_t *psw);
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu);
int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
@@ -273,10 +275,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu);
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
- unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
@@ -389,4 +388,13 @@ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
return &sca->ipte_control;
}
+static inline int kvm_s390_use_sca_entries(void)
+{
+ /*
+ * Without SIGP interpretation, only SRS interpretation (if available)
+ * might use the entries. By not setting the entries and keeping them
+ * invalid, hardware will not access them but intercept.
+ */
+ return sclp.has_sigpif;
+}
#endif
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 46160388e996..e18435355c16 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -32,6 +32,24 @@
#include "kvm-s390.h"
#include "trace.h"
+static int handle_ri(struct kvm_vcpu *vcpu)
+{
+ if (test_kvm_facility(vcpu->kvm, 64)) {
+ vcpu->arch.sie_block->ecb3 |= 0x01;
+ kvm_s390_retry_instr(vcpu);
+ return 0;
+ } else
+ return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
+{
+ if ((vcpu->arch.sie_block->ipa & 0xf) <= 4)
+ return handle_ri(vcpu);
+ else
+ return -EOPNOTSUPP;
+}
+
/* Handle SCK (SET CLOCK) interception */
static int handle_set_clock(struct kvm_vcpu *vcpu)
{
@@ -1093,6 +1111,9 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
static const intercept_handler_t eb_handlers[256] = {
[0x2f] = handle_lctlg,
[0x25] = handle_stctg,
+ [0x60] = handle_ri,
+ [0x61] = handle_ri,
+ [0x62] = handle_ri,
};
int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 94d54d0defa7..02223cb4bcfd 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
return 0;
}
- ret = __pvclock_read_cycles(pvti);
+ ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
/* refer to vread_tsc() comment for rationale */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4d0159..4b20f7304b9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -568,6 +568,7 @@ struct kvm_vcpu_arch {
struct kvm_steal_time steal;
} st;
+ u64 tsc_offset;
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
@@ -701,6 +702,8 @@ struct kvm_hv {
/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
u64 hv_crash_ctl;
+
+ HV_REFERENCE_TSC_PAGE tsc_ref;
};
struct kvm_arch {
@@ -781,54 +784,56 @@ struct kvm_arch {
bool disabled_lapic_found;
/* Struct members for AVIC */
+ u32 avic_vm_id;
u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
};
struct kvm_vm_stat {
- u32 mmu_shadow_zapped;
- u32 mmu_pte_write;
- u32 mmu_pte_updated;
- u32 mmu_pde_zapped;
- u32 mmu_flooded;
- u32 mmu_recycled;
- u32 mmu_cache_miss;
- u32 mmu_unsync;
- u32 remote_tlb_flush;
- u32 lpages;
+ ulong mmu_shadow_zapped;
+ ulong mmu_pte_write;
+ ulong mmu_pte_updated;
+ ulong mmu_pde_zapped;
+ ulong mmu_flooded;
+ ulong mmu_recycled;
+ ulong mmu_cache_miss;
+ ulong mmu_unsync;
+ ulong remote_tlb_flush;
+ ulong lpages;
};
struct kvm_vcpu_stat {
- u32 pf_fixed;
- u32 pf_guest;
- u32 tlb_flush;
- u32 invlpg;
-
- u32 exits;
- u32 io_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 nmi_window_exits;
- u32 halt_exits;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 request_irq_exits;
- u32 irq_exits;
- u32 host_state_reload;
- u32 efer_reload;
- u32 fpu_reload;
- u32 insn_emulation;
- u32 insn_emulation_fail;
- u32 hypercalls;
- u32 irq_injections;
- u32 nmi_injections;
+ u64 pf_fixed;
+ u64 pf_guest;
+ u64 tlb_flush;
+ u64 invlpg;
+
+ u64 exits;
+ u64 io_exits;
+ u64 mmio_exits;
+ u64 signal_exits;
+ u64 irq_window_exits;
+ u64 nmi_window_exits;
+ u64 halt_exits;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 request_irq_exits;
+ u64 irq_exits;
+ u64 host_state_reload;
+ u64 efer_reload;
+ u64 fpu_reload;
+ u64 insn_emulation;
+ u64 insn_emulation_fail;
+ u64 hypercalls;
+ u64 irq_injections;
+ u64 nmi_injections;
};
struct x86_instruction_info;
@@ -951,7 +956,6 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d019f0cc80ec..3ad741b84072 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
}
static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+ u64 tsc)
{
- u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+ u64 delta = tsc - src->tsc_timestamp;
cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
src->tsc_shift);
return src->system_time + offset;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 3599404e3089..5b2cc889ce34 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
do {
version = pvclock_read_begin(src);
- ret = __pvclock_read_cycles(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
flags = src->flags;
} while (pvclock_read_retry(src, version));
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 464fa477afbf..3bff20710471 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o page_track.o
+ hyperv.o page_track.o debugfs.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 3235e0fe7792..afa7bbb596cd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
- F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
+ F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+ F(AVX512BW) | F(AVX512VL);
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000000..c19c7ede9bd6
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ struct dentry *ret;
+
+ ret = debugfs_create_file("tsc-offset", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_offset_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ if (kvm_has_tsc_control) {
+ ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_fops);
+ if (!ret)
+ return -ENOMEM;
+ ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_frac_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01bd7b7a6866..42b1c83741c8 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
static u64 get_time_ref_counter(struct kvm *kvm)
{
- return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * The guest has not set up the TSC page or the clock isn't
+ * stable, fall back to get_kvmclock_ns.
+ */
+ if (!hv->tsc_ref.tsc_sequence)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
}
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
return 0;
}
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * check if scale would overflow, if so we use the time ref counter
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ return;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ return;
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ return;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ return;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ return;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
@@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
mark_page_dirty(kvm, gfn);
break;
}
- case HV_X64_MSR_REFERENCE_TSC: {
- u64 gfn;
- HV_REFERENCE_TSC_PAGE tsc_ref;
-
- memset(&tsc_ref, 0, sizeof(tsc_ref));
+ case HV_X64_MSR_REFERENCE_TSC:
hv->hv_tsc_page = data;
- if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
- break;
- gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(
- kvm,
- gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
- &tsc_ref, sizeof(tsc_ref)))
- return 1;
- mark_page_dirty(kvm, gfn);
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
break;
- }
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
return kvm_hv_msr_set_crash_data(vcpu,
msr - HV_X64_MSR_CRASH_P0,
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4bd1d3..cd1119538add 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b62c85229711..23b99f305382 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
static_key_slow_dec_deferred(&apic_hw_disabled);
- } else
+ } else {
static_key_slow_inc(&apic_hw_disabled.key);
- recalculate_apic_map(vcpu->kvm);
+ recalculate_apic_map(vcpu->kvm);
+ }
}
if ((old_value ^ value) & X2APIC_ENABLE) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3d4cc8cc56a3..d9c7e986b4e4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
*
* Return true if tlb need be flushed.
*/
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
@@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm,
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_write_protect(kvm, sptep, pt_protect);
+ flush |= spte_write_protect(sptep, pt_protect);
return flush;
}
-static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
@@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_clear_dirty(kvm, sptep);
+ flush |= spte_clear_dirty(sptep);
return flush;
}
-static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_set_dirty(u64 *sptep)
{
u64 spte = *sptep;
@@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_set_dirty(kvm, sptep);
+ flush |= spte_set_dirty(sptep);
return flush;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1e6b84b96ea6..f8157a36ab09 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -34,6 +34,8 @@
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -41,6 +43,7 @@
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS 8
+#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS 24
+#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+ (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
@@ -185,6 +201,23 @@ struct vcpu_svm {
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when update the vcpu affinity. This avoids the need to scan for
+ * IRTE and try to match ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
};
#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
@@ -242,6 +275,10 @@ static int avic;
module_param(avic, int, S_IRUGO);
#endif
+/* AVIC VM ID bit masks and lock */
+static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+static DEFINE_SPINLOCK(avic_vm_id_lock);
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_arch,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static spinlock_t svm_vm_data_hash_lock;
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_arch *ka = NULL;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_arch *vm_data = &kvm->arch;
+
+ if (vm_data->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ if (!vcpu)
+ return 0;
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu->mode == OUTSIDE_GUEST_MODE)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void)
if (avic) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_AVIC) ||
- !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+ !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
avic = false;
- else
+ } else {
pr_info("AVIC enabled\n");
+
+ hash_init(svm_vm_data_hash);
+ spin_lock_init(&svm_vm_data_hash_lock);
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ }
}
return 0;
@@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return svm->vmcb->control.tsc_offset;
-}
-
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+static inline int avic_get_next_vm_id(void)
+{
+ int id;
+
+ spin_lock(&avic_vm_id_lock);
+
+ /* AVIC VM ID is one-based. */
+ id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+ if (id <= AVIC_VM_ID_MASK)
+ __set_bit(id, avic_vm_id_bitmap);
+ else
+ id = -EAGAIN;
+
+ spin_unlock(&avic_vm_id_lock);
+ return id;
+}
+
+static inline int avic_free_vm_id(int id)
+{
+ if (id <= 0 || id > AVIC_VM_ID_MASK)
+ return -EINVAL;
+
+ spin_lock(&avic_vm_id_lock);
+ __clear_bit(id, avic_vm_id_bitmap);
+ spin_unlock(&avic_vm_id_lock);
+ return 0;
+}
+
static void avic_vm_destroy(struct kvm *kvm)
{
+ unsigned long flags;
struct kvm_arch *vm_data = &kvm->arch;
+ avic_free_vm_id(vm_data->avic_vm_id);
+
if (vm_data->avic_logical_id_table_page)
__free_page(vm_data->avic_logical_id_table_page);
if (vm_data->avic_physical_id_table_page)
__free_page(vm_data->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&vm_data->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
static int avic_vm_init(struct kvm *kvm)
{
- int err = -ENOMEM;
+ unsigned long flags;
+ int vm_id, err = -ENOMEM;
struct kvm_arch *vm_data = &kvm->arch;
struct page *p_page;
struct page *l_page;
@@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm)
if (!avic)
return 0;
+ vm_id = avic_get_next_vm_id();
+ if (vm_id < 0)
+ return vm_id;
+ vm_data->avic_vm_id = (u32)vm_id;
+
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL);
if (!p_page)
@@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm)
vm_data->avic_logical_id_table_page = l_page;
clear_page(page_address(l_page));
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
return 0;
free_avic:
@@ -1323,31 +1452,34 @@ free_avic:
return err;
}
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
- u64 entry;
- int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- svm->avic_is_running = is_run;
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
- if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
- return;
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+ if (list_empty(&svm->ir_list))
+ goto out;
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- if (is_run)
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
}
static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+ svm->avic_is_running);
}
static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->avic_is_running = is_run;
+ if (is_run)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+}
+
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
err = avic_init_backing_page(&svm->vcpu);
if (err)
goto free_page4;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
}
/* We initialize this flag to true to make sure that the is_running
@@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
kvm_vcpu_wake_up(vcpu);
}
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+
+ /**
+ * In some cases, the existing irte is updaed and re-set,
+ * so we need to check here if it's already been * added
+ * to the ir_list.
+ */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+ /**
+ * Allocating new amd_iommu_pi_data, which will get
+ * add to the per-vcpu ir_list.
+ */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ /**
+ * Here, we setup with legacy mode in the following cases:
+ * 1. When cannot target interrupt to a specific vcpu.
+ * 2. Unsetting posted interrupt.
+ * 3. APIC virtialization is disabled for the vcpu.
+ */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+ pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Here, we successfully setting up vcpu affinity in
+ * IOMMU guest mode. Now, we need to store the posted
+ * interrupt information in a per-vcpu ir_list so that
+ * we can reference to them directly when we update vcpu
+ * scheduling information in IOMMU irte.
+ */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+ host_irq, e->gsi,
+ vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
.read_l1_tsc = svm_read_l1_tsc,
@@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.pmu_ops = &amd_pmu_ops,
.deliver_posted_interrupt = svm_deliver_avic_intr,
+ .update_pi_irte = svm_update_pi_irte,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 121fdf6e9ed0..cf1b16dbc98a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;
@@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock);
static struct vmcs_config {
int size;
int order;
+ u32 basic_cap;
u32 revision_id;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
@@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+ return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
return flexpriority_enabled && lapic_in_kernel(vcpu);
@@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
else if (cpu_has_secondary_exec_ctrls() &&
(vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
- else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ } else {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+ }
} else {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode;
@@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return host_tsc + tsc_offset;
}
-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
- return vmcs_read64(TSC_OFFSET);
-}
-
/*
* writes 'offset' into guest's timestamp counter offset register
*/
@@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+ if (cpu_has_vmx_basic_inout())
+ *pdata |= VMX_BASIC_INOUT;
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
@@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_config.size);
+ vmcs_conf->order = get_order(vmcs_conf->size);
+ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
msr, MSR_TYPE_R | MSR_TYPE_W);
}
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
{
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ if (apicv_active) {
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+ } else {
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ }
}
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ if (apicv_active) {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+ } else {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ }
}
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_W);
+ if (apicv_active) {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_W);
+ } else {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_W);
+ }
}
static bool vmx_get_enable_apicv(void)
@@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (is_guest_mode(vcpu))
- return;
+ if (!is_guest_mode(vcpu)) {
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->soft_vnmi_blocked = 1;
+ vmx->vnmi_blocked_time = 0;
+ }
- if (!cpu_has_virtual_nmis()) {
- /*
- * Tracking the NMI-blocked state in software is built upon
- * finding the next open IRQ window. This, in turn, depends on
- * well-behaving guests: They have to keep IRQs disabled at
- * least as long as the NMI handler runs. Otherwise we may
- * cause NMI nesting, maybe breaking the guest. But as this is
- * highly unlikely, we can live with the residual risk.
- */
- vmx->soft_vnmi_blocked = 1;
- vmx->vnmi_blocked_time = 0;
+ ++vcpu->stat.nmi_injections;
+ vmx->nmi_known_unmasked = false;
}
- ++vcpu->stat.nmi_injections;
- vmx->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
+
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
@@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
gla_validity = (exit_qualification >> 7) & 0x3;
- if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+ if (gla_validity == 0x2) {
printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@ -6360,22 +6395,32 @@ static __init int hardware_setup(void)
if (!vmx_msr_bitmap_legacy_x2apic)
goto out2;
+ vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+ goto out3;
+
vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode)
- goto out3;
+ goto out4;
vmx_msr_bitmap_longmode_x2apic =
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
- goto out4;
+ goto out5;
+
+ vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+ goto out6;
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
- goto out6;
+ goto out7;
vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
- goto out7;
+ goto out8;
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6394,7 +6439,7 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
- goto out8;
+ goto out9;
}
if (boot_cpu_has(X86_FEATURE_NX))
@@ -6461,20 +6506,35 @@ static __init int hardware_setup(void)
vmx_msr_bitmap_legacy, PAGE_SIZE);
memcpy(vmx_msr_bitmap_longmode_x2apic,
vmx_msr_bitmap_longmode, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ vmx_msr_bitmap_longmode, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+ /*
+ * enable_apicv && kvm_vcpu_apicv_active()
+ */
for (msr = 0x800; msr <= 0x8ff; msr++)
- vmx_disable_intercept_msr_read_x2apic(msr);
+ vmx_disable_intercept_msr_read_x2apic(msr, true);
/* TMCCT */
- vmx_enable_intercept_msr_read_x2apic(0x839);
+ vmx_enable_intercept_msr_read_x2apic(0x839, true);
/* TPR */
- vmx_disable_intercept_msr_write_x2apic(0x808);
+ vmx_disable_intercept_msr_write_x2apic(0x808, true);
/* EOI */
- vmx_disable_intercept_msr_write_x2apic(0x80b);
+ vmx_disable_intercept_msr_write_x2apic(0x80b, true);
/* SELF-IPI */
- vmx_disable_intercept_msr_write_x2apic(0x83f);
+ vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+
+ /*
+ * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+ * !enable_apicv
+ */
+ /* TPR */
+ vmx_disable_intercept_msr_read_x2apic(0x808, false);
+ vmx_disable_intercept_msr_write_x2apic(0x808, false);
if (enable_ept) {
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6521,14 +6581,18 @@ static __init int hardware_setup(void)
return alloc_kvm_area();
-out8:
+out9:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out7:
+out8:
free_page((unsigned long)vmx_vmread_bitmap);
+out7:
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
out6:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
+out5:
free_page((unsigned long)vmx_msr_bitmap_longmode);
+out4:
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
out3:
free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
out2:
@@ -6544,7 +6608,9 @@ out:
static __exit void hardware_unsetup(void)
{
free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
free_page((unsigned long)vmx_msr_bitmap_legacy);
free_page((unsigned long)vmx_msr_bitmap_longmode);
free_page((unsigned long)vmx_io_bitmap_b);
@@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+ pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
vmx->nested.vmcs02_num = 0;
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
+ HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
vmx->nested.vmxon = true;
@@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
- /*
- * There is not point to enable virtualize x2apic without enable
- * apicv
- */
- if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!cpu_has_vmx_virtualize_x2apic_mode())
return;
if (!cpu_need_tpr_shadow(vcpu))
@@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
maxphyaddr = cpuid_maxphyaddr(vcpu);
if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
(addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
addr_field, maxphyaddr, count, addr);
return -EINVAL;
@@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
for (i = 0; i < count; i++) {
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
&e, sizeof(e))) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
goto fail;
}
if (nested_vmx_load_msr_check(vcpu, &e)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s check failed (%u, 0x%x, 0x%x)\n",
__func__, i, e.index, e.reserved);
goto fail;
@@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
msr.index = e.index;
msr.data = e.value;
if (kvm_set_msr(vcpu, &msr)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
goto fail;
@@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
if (kvm_vcpu_read_guest(vcpu,
gpa + i * sizeof(e),
&e, 2 * sizeof(u32))) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
return -EINVAL;
}
if (nested_vmx_store_msr_check(vcpu, &e)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s check failed (%u, 0x%x, 0x%x)\n",
__func__, i, e.index, e.reserved);
return -EINVAL;
@@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
msr_info.host_initiated = false;
msr_info.index = e.index;
if (kvm_get_msr(vcpu, &msr_info)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot read MSR (%u, 0x%x)\n",
__func__, i, e.index);
return -EINVAL;
@@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
gpa + i * sizeof(e) +
offsetof(struct vmx_msr_entry, value),
&msr_info.data, sizeof(msr_info.data))) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, msr_info.data);
return -EINVAL;
@@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
+ if (nested_cpu_has_ept(vmcs12))
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
if (nested_cpu_has_vid(vmcs12))
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
@@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
* We are now running in L2, mmu_notifier will force to reload the
* page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
*/
- kvm_vcpu_reload_apic_access_page(vcpu);
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
/*
* Exiting from L2 to L1, we're now back to L1 which thinks it just
@@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
.read_l1_tsc = vmx_read_l1_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 699f8726539a..6c633de84dd7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
{
- u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+ u64 curr_offset = vcpu->arch.tsc_offset;
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
@@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_offset = offset;
+}
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = get_kernel_ns();
+ ns = ktime_get_boot_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
@@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
update_ia32_tsc_adjust_msr(vcpu, offset);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
#endif
}
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+ struct kvm_arch *ka = &kvm->arch;
+ s64 ns;
+
+ if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+ u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+ } else {
+ ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+ }
+
+ return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ unsigned long flags;
+ s64 ns;
+
+ local_irq_save(flags);
+ ns = __get_kvmclock_ns(kvm);
+ local_irq_restore(flags);
+
+ return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct pvclock_vcpu_time_info guest_hv_clock;
+
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return;
+
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
+ */
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
@@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
- struct pvclock_vcpu_time_info guest_hv_clock;
u8 pvclock_flags;
bool use_master_clock;
@@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = get_kernel_ns();
+ kernel_ns = ktime_get_boot_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_restore(flags);
- if (!vcpu->pv_time_enabled)
- return 0;
+ /* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hw_tsc_khz = tgt_tsc_khz;
}
- /* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
- return 0;
-
- /* This VCPU is paused, but it's legal for a guest to read another
- * VCPU's kvmclock, so we really have to follow the specification where
- * it says that version is odd if data is being modified, and even after
- * it is consistent.
- *
- * Version field updates must be kept separate. This is because
- * kvm_write_guest_cached might use a "rep movs" instruction, and
- * writes within a string instruction are weakly ordered. So there
- * are three writes overall.
- *
- * As a small optimization, only write the version field in the first
- * and third write. The vcpu->pv_time cache is still valid, because the
- * version field is the first in the struct.
- */
- BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
- vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
-
- smp_wmb();
-
- /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
- if (vcpu->pvclock_set_guest_stopped_request) {
- pvclock_flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
-
/* If the host uses TSC clocksource, then it is stable */
+ pvclock_flags = 0;
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
vcpu->hv_clock.flags = pvclock_flags;
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
-
- smp_wmb();
-
- vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ if (vcpu->pv_time_enabled)
+ kvm_setup_pvclock_page(v);
+ if (v == kvm_get_vcpu(v->kvm, 0))
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
if (kvm_lapic_hv_timer_in_use(vcpu) &&
@@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_CLOCK: {
struct kvm_clock_data user_ns;
u64 now_ns;
- s64 delta;
r = -EFAULT;
if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
local_irq_disable();
- now_ns = get_kernel_ns();
- delta = user_ns.clock - now_ns;
+ now_ns = __get_kvmclock_ns(kvm);
+ kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
local_irq_enable();
- kvm->arch.kvmclock_offset = delta;
kvm_gen_update_masterclock(kvm);
break;
}
@@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm_clock_data user_ns;
u64 now_ns;
- local_irq_disable();
- now_ns = get_kernel_ns();
- user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
- local_irq_enable();
+ now_ns = get_kvmclock_ns(kvm);
+ user_ns.clock = now_ns;
user_ns.flags = 0;
memset(&user_ns.pad, 0, sizeof(user_ns.pad));
@@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_put_guest_xcr0(vcpu);
- /* Interrupt is enabled by handle_external_intr() */
kvm_x86_ops->handle_external_intr(vcpu);
++vcpu->stat.exits;
@@ -7530,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
* before any KVM threads can be running. Unfortunately, we can't
* bring the TSCs fully up to date with real time, as we aren't yet far
* enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, get_kernel_ns() will be using boot
+ * elapsed; our helper function, ktime_get_boot_ns() will be using boot
* variables that haven't been updated yet.
*
* So we simply find the maximum observed TSC above, then record the
@@ -7765,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+ kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
pvclock_update_vm_gtod_copy(kvm);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a82ca466b62e..e8ff3e4ce38a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
return kvm_register_write(vcpu, reg, val);
}
-static inline u64 get_kernel_ns(void)
-{
- return ktime_get_boot_ns();
-}
-
static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
{
return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 4025291ea0ae..58fa8cc0262b 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -137,6 +137,7 @@ struct iommu_dev_data {
bool pri_tlp; /* PASID TLB required for
PPR completions */
u32 errata; /* Bitmap for errata to apply */
+ bool use_vapic; /* Enable device to use vapic mode */
};
/*
@@ -707,14 +708,74 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu)
}
}
+#ifdef CONFIG_IRQ_REMAP
+static int (*iommu_ga_log_notifier)(u32);
+
+int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+ iommu_ga_log_notifier = notifier;
+
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
+
+static void iommu_poll_ga_log(struct amd_iommu *iommu)
+{
+ u32 head, tail, cnt = 0;
+
+ if (iommu->ga_log == NULL)
+ return;
+
+ head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+ while (head != tail) {
+ volatile u64 *raw;
+ u64 log_entry;
+
+ raw = (u64 *)(iommu->ga_log + head);
+ cnt++;
+
+ /* Avoid memcpy function-call overhead */
+ log_entry = *raw;
+
+ /* Update head pointer of hardware ring-buffer */
+ head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
+ writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+
+ /* Handle GA entry */
+ switch (GA_REQ_TYPE(log_entry)) {
+ case GA_GUEST_NR:
+ if (!iommu_ga_log_notifier)
+ break;
+
+ pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n",
+ __func__, GA_DEVID(log_entry),
+ GA_TAG(log_entry));
+
+ if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
+ pr_err("AMD-Vi: GA log notifier failed.\n");
+ break;
+ default:
+ break;
+ }
+ }
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+#define AMD_IOMMU_INT_MASK \
+ (MMIO_STATUS_EVT_INT_MASK | \
+ MMIO_STATUS_PPR_INT_MASK | \
+ MMIO_STATUS_GALOG_INT_MASK)
+
irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
struct amd_iommu *iommu = (struct amd_iommu *) data;
u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
- /* Enable EVT and PPR interrupts again */
- writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
+ while (status & AMD_IOMMU_INT_MASK) {
+ /* Enable EVT and PPR and GA interrupts again */
+ writel(AMD_IOMMU_INT_MASK,
iommu->mmio_base + MMIO_STATUS_OFFSET);
if (status & MMIO_STATUS_EVT_INT_MASK) {
@@ -727,6 +788,13 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data)
iommu_poll_ppr_log(iommu);
}
+#ifdef CONFIG_IRQ_REMAP
+ if (status & MMIO_STATUS_GALOG_INT_MASK) {
+ pr_devel("AMD-Vi: Processing IOMMU GA Log\n");
+ iommu_poll_ga_log(iommu);
+ }
+#endif
+
/*
* Hardware bug: ERBT1312
* When re-enabling interrupt (by writing 1
@@ -2967,6 +3035,12 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
if (!iommu)
return;
+#ifdef CONFIG_IRQ_REMAP
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+ (dom->type == IOMMU_DOMAIN_UNMANAGED))
+ dev_data->use_vapic = 0;
+#endif
+
iommu_completion_wait(iommu);
}
@@ -2992,6 +3066,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
ret = attach_device(dev, domain);
+#ifdef CONFIG_IRQ_REMAP
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+ if (dom->type == IOMMU_DOMAIN_UNMANAGED)
+ dev_data->use_vapic = 1;
+ else
+ dev_data->use_vapic = 0;
+ }
+#endif
+
iommu_completion_wait(iommu);
return ret;
@@ -3530,34 +3613,6 @@ EXPORT_SYMBOL(amd_iommu_device_info);
*
*****************************************************************************/
-union irte {
- u32 val;
- struct {
- u32 valid : 1,
- no_fault : 1,
- int_type : 3,
- rq_eoi : 1,
- dm : 1,
- rsvd_1 : 1,
- destination : 8,
- vector : 8,
- rsvd_2 : 8;
- } fields;
-};
-
-struct irq_2_irte {
- u16 devid; /* Device ID for IRTE table */
- u16 index; /* Index into IRTE table*/
-};
-
-struct amd_ir_data {
- struct irq_2_irte irq_2_irte;
- union irte irte_entry;
- union {
- struct msi_msg msi_entry;
- };
-};
-
static struct irq_chip amd_ir_chip;
#define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6)
@@ -3579,8 +3634,6 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
amd_iommu_dev_table[devid].data[2] = dte;
}
-#define IRTE_ALLOCATED (~1U)
-
static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
{
struct irq_remap_table *table = NULL;
@@ -3626,13 +3679,18 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
goto out;
}
- memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32));
+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ memset(table->table, 0,
+ MAX_IRQS_PER_TABLE * sizeof(u32));
+ else
+ memset(table->table, 0,
+ (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
if (ioapic) {
int i;
for (i = 0; i < 32; ++i)
- table->table[i] = IRTE_ALLOCATED;
+ iommu->irte_ops->set_allocated(table, i);
}
irq_lookup_table[devid] = table;
@@ -3658,6 +3716,10 @@ static int alloc_irq_index(u16 devid, int count)
struct irq_remap_table *table;
unsigned long flags;
int index, c;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!iommu)
+ return -ENODEV;
table = get_irq_table(devid, false);
if (!table)
@@ -3669,14 +3731,14 @@ static int alloc_irq_index(u16 devid, int count)
for (c = 0, index = table->min_index;
index < MAX_IRQS_PER_TABLE;
++index) {
- if (table->table[index] == 0)
+ if (!iommu->irte_ops->is_allocated(table, index))
c += 1;
else
c = 0;
if (c == count) {
for (; c != 0; --c)
- table->table[index - c + 1] = IRTE_ALLOCATED;
+ iommu->irte_ops->set_allocated(table, index - c + 1);
index -= count - 1;
goto out;
@@ -3691,7 +3753,42 @@ out:
return index;
}
-static int modify_irte(u16 devid, int index, union irte irte)
+static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
+ struct amd_ir_data *data)
+{
+ struct irq_remap_table *table;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+ struct irte_ga *entry;
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (iommu == NULL)
+ return -EINVAL;
+
+ table = get_irq_table(devid, false);
+ if (!table)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&table->lock, flags);
+
+ entry = (struct irte_ga *)table->table;
+ entry = &entry[index];
+ entry->lo.fields_remap.valid = 0;
+ entry->hi.val = irte->hi.val;
+ entry->lo.val = irte->lo.val;
+ entry->lo.fields_remap.valid = 1;
+ if (data)
+ data->ref = entry;
+
+ spin_unlock_irqrestore(&table->lock, flags);
+
+ iommu_flush_irt(iommu, devid);
+ iommu_completion_wait(iommu);
+
+ return 0;
+}
+
+static int modify_irte(u16 devid, int index, union irte *irte)
{
struct irq_remap_table *table;
struct amd_iommu *iommu;
@@ -3706,7 +3803,7 @@ static int modify_irte(u16 devid, int index, union irte irte)
return -ENOMEM;
spin_lock_irqsave(&table->lock, flags);
- table->table[index] = irte.val;
+ table->table[index] = irte->val;
spin_unlock_irqrestore(&table->lock, flags);
iommu_flush_irt(iommu, devid);
@@ -3730,13 +3827,146 @@ static void free_irte(u16 devid, int index)
return;
spin_lock_irqsave(&table->lock, flags);
- table->table[index] = 0;
+ iommu->irte_ops->clear_allocated(table, index);
spin_unlock_irqrestore(&table->lock, flags);
iommu_flush_irt(iommu, devid);
iommu_completion_wait(iommu);
}
+static void irte_prepare(void *entry,
+ u32 delivery_mode, u32 dest_mode,
+ u8 vector, u32 dest_apicid, int devid)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->val = 0;
+ irte->fields.vector = vector;
+ irte->fields.int_type = delivery_mode;
+ irte->fields.destination = dest_apicid;
+ irte->fields.dm = dest_mode;
+ irte->fields.valid = 1;
+}
+
+static void irte_ga_prepare(void *entry,
+ u32 delivery_mode, u32 dest_mode,
+ u8 vector, u32 dest_apicid, int devid)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+ struct iommu_dev_data *dev_data = search_dev_data(devid);
+
+ irte->lo.val = 0;
+ irte->hi.val = 0;
+ irte->lo.fields_remap.guest_mode = dev_data ? dev_data->use_vapic : 0;
+ irte->lo.fields_remap.int_type = delivery_mode;
+ irte->lo.fields_remap.dm = dest_mode;
+ irte->hi.fields.vector = vector;
+ irte->lo.fields_remap.destination = dest_apicid;
+ irte->lo.fields_remap.valid = 1;
+}
+
+static void irte_activate(void *entry, u16 devid, u16 index)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->fields.valid = 1;
+ modify_irte(devid, index, irte);
+}
+
+static void irte_ga_activate(void *entry, u16 devid, u16 index)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+
+ irte->lo.fields_remap.valid = 1;
+ modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_deactivate(void *entry, u16 devid, u16 index)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->fields.valid = 0;
+ modify_irte(devid, index, irte);
+}
+
+static void irte_ga_deactivate(void *entry, u16 devid, u16 index)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+
+ irte->lo.fields_remap.valid = 0;
+ modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_set_affinity(void *entry, u16 devid, u16 index,
+ u8 vector, u32 dest_apicid)
+{
+ union irte *irte = (union irte *) entry;
+
+ irte->fields.vector = vector;
+ irte->fields.destination = dest_apicid;
+ modify_irte(devid, index, irte);
+}
+
+static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
+ u8 vector, u32 dest_apicid)
+{
+ struct irte_ga *irte = (struct irte_ga *) entry;
+ struct iommu_dev_data *dev_data = search_dev_data(devid);
+
+ if (!dev_data || !dev_data->use_vapic) {
+ irte->hi.fields.vector = vector;
+ irte->lo.fields_remap.destination = dest_apicid;
+ irte->lo.fields_remap.guest_mode = 0;
+ modify_irte_ga(devid, index, irte, NULL);
+ }
+}
+
+#define IRTE_ALLOCATED (~1U)
+static void irte_set_allocated(struct irq_remap_table *table, int index)
+{
+ table->table[index] = IRTE_ALLOCATED;
+}
+
+static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
+{
+ struct irte_ga *ptr = (struct irte_ga *)table->table;
+ struct irte_ga *irte = &ptr[index];
+
+ memset(&irte->lo.val, 0, sizeof(u64));
+ memset(&irte->hi.val, 0, sizeof(u64));
+ irte->hi.fields.vector = 0xff;
+}
+
+static bool irte_is_allocated(struct irq_remap_table *table, int index)
+{
+ union irte *ptr = (union irte *)table->table;
+ union irte *irte = &ptr[index];
+
+ return irte->val != 0;
+}
+
+static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
+{
+ struct irte_ga *ptr = (struct irte_ga *)table->table;
+ struct irte_ga *irte = &ptr[index];
+
+ return irte->hi.fields.vector != 0;
+}
+
+static void irte_clear_allocated(struct irq_remap_table *table, int index)
+{
+ table->table[index] = 0;
+}
+
+static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
+{
+ struct irte_ga *ptr = (struct irte_ga *)table->table;
+ struct irte_ga *irte = &ptr[index];
+
+ memset(&irte->lo.val, 0, sizeof(u64));
+ memset(&irte->hi.val, 0, sizeof(u64));
+}
+
static int get_devid(struct irq_alloc_info *info)
{
int devid = -1;
@@ -3821,19 +4051,17 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
{
struct irq_2_irte *irte_info = &data->irq_2_irte;
struct msi_msg *msg = &data->msi_entry;
- union irte *irte = &data->irte_entry;
struct IO_APIC_route_entry *entry;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!iommu)
+ return;
data->irq_2_irte.devid = devid;
data->irq_2_irte.index = index + sub_handle;
-
- /* Setup IRTE for IOMMU */
- irte->val = 0;
- irte->fields.vector = irq_cfg->vector;
- irte->fields.int_type = apic->irq_delivery_mode;
- irte->fields.destination = irq_cfg->dest_apicid;
- irte->fields.dm = apic->irq_dest_mode;
- irte->fields.valid = 1;
+ iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode,
+ apic->irq_dest_mode, irq_cfg->vector,
+ irq_cfg->dest_apicid, devid);
switch (info->type) {
case X86_IRQ_ALLOC_TYPE_IOAPIC:
@@ -3864,12 +4092,32 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
}
}
+struct amd_irte_ops irte_32_ops = {
+ .prepare = irte_prepare,
+ .activate = irte_activate,
+ .deactivate = irte_deactivate,
+ .set_affinity = irte_set_affinity,
+ .set_allocated = irte_set_allocated,
+ .is_allocated = irte_is_allocated,
+ .clear_allocated = irte_clear_allocated,
+};
+
+struct amd_irte_ops irte_128_ops = {
+ .prepare = irte_ga_prepare,
+ .activate = irte_ga_activate,
+ .deactivate = irte_ga_deactivate,
+ .set_affinity = irte_ga_set_affinity,
+ .set_allocated = irte_ga_set_allocated,
+ .is_allocated = irte_ga_is_allocated,
+ .clear_allocated = irte_ga_clear_allocated,
+};
+
static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs, void *arg)
{
struct irq_alloc_info *info = arg;
struct irq_data *irq_data;
- struct amd_ir_data *data;
+ struct amd_ir_data *data = NULL;
struct irq_cfg *cfg;
int i, ret, devid;
int index = -1;
@@ -3921,6 +4169,16 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
if (!data)
goto out_free_data;
+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
+ else
+ data->entry = kzalloc(sizeof(struct irte_ga),
+ GFP_KERNEL);
+ if (!data->entry) {
+ kfree(data);
+ goto out_free_data;
+ }
+
irq_data->hwirq = (devid << 16) + i;
irq_data->chip_data = data;
irq_data->chip = &amd_ir_chip;
@@ -3957,6 +4215,7 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
data = irq_data->chip_data;
irte_info = &data->irq_2_irte;
free_irte(irte_info->devid, irte_info->index);
+ kfree(data->entry);
kfree(data);
}
}
@@ -3968,8 +4227,11 @@ static void irq_remapping_activate(struct irq_domain *domain,
{
struct amd_ir_data *data = irq_data->chip_data;
struct irq_2_irte *irte_info = &data->irq_2_irte;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
- modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+ if (iommu)
+ iommu->irte_ops->activate(data->entry, irte_info->devid,
+ irte_info->index);
}
static void irq_remapping_deactivate(struct irq_domain *domain,
@@ -3977,10 +4239,11 @@ static void irq_remapping_deactivate(struct irq_domain *domain,
{
struct amd_ir_data *data = irq_data->chip_data;
struct irq_2_irte *irte_info = &data->irq_2_irte;
- union irte entry;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
- entry.val = 0;
- modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+ if (iommu)
+ iommu->irte_ops->deactivate(data->entry, irte_info->devid,
+ irte_info->index);
}
static struct irq_domain_ops amd_ir_domain_ops = {
@@ -3990,6 +4253,70 @@ static struct irq_domain_ops amd_ir_domain_ops = {
.deactivate = irq_remapping_deactivate,
};
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+ struct amd_iommu *iommu;
+ struct amd_iommu_pi_data *pi_data = vcpu_info;
+ struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+ struct amd_ir_data *ir_data = data->chip_data;
+ struct irte_ga *irte = (struct irte_ga *) ir_data->entry;
+ struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+ struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid);
+
+ /* Note:
+ * This device has never been set up for guest mode.
+ * we should not modify the IRTE
+ */
+ if (!dev_data || !dev_data->use_vapic)
+ return 0;
+
+ pi_data->ir_data = ir_data;
+
+ /* Note:
+ * SVM tries to set up for VAPIC mode, but we are in
+ * legacy mode. So, we force legacy mode instead.
+ */
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+ pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n",
+ __func__);
+ pi_data->is_guest_mode = false;
+ }
+
+ iommu = amd_iommu_rlookup_table[irte_info->devid];
+ if (iommu == NULL)
+ return -EINVAL;
+
+ pi_data->prev_ga_tag = ir_data->cached_ga_tag;
+ if (pi_data->is_guest_mode) {
+ /* Setting */
+ irte->hi.fields.ga_root_ptr = (pi_data->base >> 12);
+ irte->hi.fields.vector = vcpu_pi_info->vector;
+ irte->lo.fields_vapic.guest_mode = 1;
+ irte->lo.fields_vapic.ga_tag = pi_data->ga_tag;
+
+ ir_data->cached_ga_tag = pi_data->ga_tag;
+ } else {
+ /* Un-Setting */
+ struct irq_cfg *cfg = irqd_cfg(data);
+
+ irte->hi.val = 0;
+ irte->lo.val = 0;
+ irte->hi.fields.vector = cfg->vector;
+ irte->lo.fields_remap.guest_mode = 0;
+ irte->lo.fields_remap.destination = cfg->dest_apicid;
+ irte->lo.fields_remap.int_type = apic->irq_delivery_mode;
+ irte->lo.fields_remap.dm = apic->irq_dest_mode;
+
+ /*
+ * This communicates the ga_tag back to the caller
+ * so that it can do all the necessary clean up.
+ */
+ ir_data->cached_ga_tag = 0;
+ }
+
+ return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data);
+}
+
static int amd_ir_set_affinity(struct irq_data *data,
const struct cpumask *mask, bool force)
{
@@ -3997,8 +4324,12 @@ static int amd_ir_set_affinity(struct irq_data *data,
struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
struct irq_cfg *cfg = irqd_cfg(data);
struct irq_data *parent = data->parent_data;
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
int ret;
+ if (!iommu)
+ return -ENODEV;
+
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
return ret;
@@ -4007,9 +4338,8 @@ static int amd_ir_set_affinity(struct irq_data *data,
* Atomically updates the IRTE with the new destination, vector
* and flushes the interrupt entry cache.
*/
- ir_data->irte_entry.fields.vector = cfg->vector;
- ir_data->irte_entry.fields.destination = cfg->dest_apicid;
- modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry);
+ iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
+ irte_info->index, cfg->vector, cfg->dest_apicid);
/*
* After this point, all the interrupts will start arriving
@@ -4031,6 +4361,7 @@ static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
static struct irq_chip amd_ir_chip = {
.irq_ack = ir_ack_apic_edge,
.irq_set_affinity = amd_ir_set_affinity,
+ .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
.irq_compose_msi_msg = ir_compose_msi_msg,
};
@@ -4045,4 +4376,43 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
return 0;
}
+
+int amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct irq_remap_table *irt;
+ struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+ int devid = ir_data->irq_2_irte.devid;
+ struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+ struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
+
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
+ !ref || !entry || !entry->lo.fields_vapic.guest_mode)
+ return 0;
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ return -ENODEV;
+
+ irt = get_irq_table(devid, false);
+ if (!irt)
+ return -ENODEV;
+
+ spin_lock_irqsave(&irt->lock, flags);
+
+ if (ref->lo.fields_vapic.guest_mode) {
+ if (cpu >= 0)
+ ref->lo.fields_vapic.destination = cpu;
+ ref->lo.fields_vapic.is_run = is_run;
+ barrier();
+ }
+
+ spin_unlock_irqrestore(&irt->lock, flags);
+
+ iommu_flush_irt(iommu, devid);
+ iommu_completion_wait(iommu);
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
#endif
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 59741ead7e15..cd1713631a4a 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -84,6 +84,7 @@
#define ACPI_DEVFLAG_LINT1 0x80
#define ACPI_DEVFLAG_ATSDIS 0x10000000
+#define LOOP_TIMEOUT 100000
/*
* ACPI table definitions
*
@@ -145,6 +146,8 @@ struct ivmd_header {
bool amd_iommu_dump;
bool amd_iommu_irq_remap __read_mostly;
+int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+
static bool amd_iommu_detected;
static bool __initdata amd_iommu_disabled;
static int amd_iommu_target_ivhd_type;
@@ -386,6 +389,10 @@ static void iommu_disable(struct amd_iommu *iommu)
iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+ /* Disable IOMMU GA_LOG */
+ iommu_feature_disable(iommu, CONTROL_GALOG_EN);
+ iommu_feature_disable(iommu, CONTROL_GAINT_EN);
+
/* Disable IOMMU hardware itself */
iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
}
@@ -671,6 +678,99 @@ static void __init free_ppr_log(struct amd_iommu *iommu)
free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE));
}
+static void free_ga_log(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ if (iommu->ga_log)
+ free_pages((unsigned long)iommu->ga_log,
+ get_order(GA_LOG_SIZE));
+ if (iommu->ga_log_tail)
+ free_pages((unsigned long)iommu->ga_log_tail,
+ get_order(8));
+#endif
+}
+
+static int iommu_ga_log_enable(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ u32 status, i;
+
+ if (!iommu->ga_log)
+ return -EINVAL;
+
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+ /* Check if already running */
+ if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+ return 0;
+
+ iommu_feature_enable(iommu, CONTROL_GAINT_EN);
+ iommu_feature_enable(iommu, CONTROL_GALOG_EN);
+
+ for (i = 0; i < LOOP_TIMEOUT; ++i) {
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+ break;
+ }
+
+ if (i >= LOOP_TIMEOUT)
+ return -EINVAL;
+#endif /* CONFIG_IRQ_REMAP */
+ return 0;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static int iommu_init_ga_log(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ return 0;
+
+ iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(GA_LOG_SIZE));
+ if (!iommu->ga_log)
+ goto err_out;
+
+ iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(8));
+ if (!iommu->ga_log_tail)
+ goto err_out;
+
+ entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+ memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
+ &entry, sizeof(entry));
+ entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
+ memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
+ &entry, sizeof(entry));
+ writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+ return 0;
+err_out:
+ free_ga_log(iommu);
+ return -EINVAL;
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+static int iommu_init_ga(struct amd_iommu *iommu)
+{
+ int ret = 0;
+
+#ifdef CONFIG_IRQ_REMAP
+ /* Note: We have already checked GASup from IVRS table.
+ * Now, we need to make sure that GAMSup is set.
+ */
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+ !iommu_feature(iommu, FEATURE_GAM_VAPIC))
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+
+ ret = iommu_init_ga_log(iommu);
+#endif /* CONFIG_IRQ_REMAP */
+
+ return ret;
+}
+
static void iommu_enable_gt(struct amd_iommu *iommu)
{
if (!iommu_feature(iommu, FEATURE_GT))
@@ -1144,6 +1244,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu)
free_command_buffer(iommu);
free_event_buffer(iommu);
free_ppr_log(iommu);
+ free_ga_log(iommu);
iommu_unmap_mmio_space(iommu);
}
@@ -1258,6 +1359,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
else
iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+ if (((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0))
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
break;
case 0x11:
case 0x40:
@@ -1265,6 +1368,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
else
iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+ if (((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0))
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
break;
default:
return -EINVAL;
@@ -1432,6 +1537,7 @@ static int iommu_init_pci(struct amd_iommu *iommu)
{
int cap_ptr = iommu->cap_ptr;
u32 range, misc, low, high;
+ int ret;
iommu->dev = pci_get_bus_and_slot(PCI_BUS_NUM(iommu->devid),
iommu->devid & 0xff);
@@ -1488,6 +1594,10 @@ static int iommu_init_pci(struct amd_iommu *iommu)
if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu))
return -ENOMEM;
+ ret = iommu_init_ga(iommu);
+ if (ret)
+ return ret;
+
if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
amd_iommu_np_cache = true;
@@ -1545,16 +1655,24 @@ static void print_iommu_info(void)
dev_name(&iommu->dev->dev), iommu->cap_ptr);
if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
- pr_info("AMD-Vi: Extended features: ");
+ pr_info("AMD-Vi: Extended features (%#llx):\n",
+ iommu->features);
for (i = 0; i < ARRAY_SIZE(feat_str); ++i) {
if (iommu_feature(iommu, (1ULL << i)))
pr_cont(" %s", feat_str[i]);
}
+
+ if (iommu->features & FEATURE_GAM_VAPIC)
+ pr_cont(" GA_vAPIC");
+
pr_cont("\n");
}
}
- if (irq_remapping_enabled)
+ if (irq_remapping_enabled) {
pr_info("AMD-Vi: Interrupt remapping enabled\n");
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ pr_info("AMD-Vi: virtual APIC enabled\n");
+ }
}
static int __init amd_iommu_init_pci(void)
@@ -1645,6 +1763,8 @@ enable_faults:
if (iommu->ppr_log != NULL)
iommu_feature_enable(iommu, CONTROL_PPFINT_EN);
+ iommu_ga_log_enable(iommu);
+
return 0;
}
@@ -1862,6 +1982,24 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
iommu->stored_addr_lo | 1);
}
+static void iommu_enable_ga(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+ switch (amd_iommu_guest_ir) {
+ case AMD_IOMMU_GUEST_IR_VAPIC:
+ iommu_feature_enable(iommu, CONTROL_GAM_EN);
+ /* Fall through */
+ case AMD_IOMMU_GUEST_IR_LEGACY_GA:
+ iommu_feature_enable(iommu, CONTROL_GA_EN);
+ iommu->irte_ops = &irte_128_ops;
+ break;
+ default:
+ iommu->irte_ops = &irte_32_ops;
+ break;
+ }
+#endif
+}
+
/*
* This function finally enables all IOMMUs found in the system after
* they have been initialized
@@ -1877,9 +2015,15 @@ static void early_enable_iommus(void)
iommu_enable_command_buffer(iommu);
iommu_enable_event_buffer(iommu);
iommu_set_exclusion_range(iommu);
+ iommu_enable_ga(iommu);
iommu_enable(iommu);
iommu_flush_all_caches(iommu);
}
+
+#ifdef CONFIG_IRQ_REMAP
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
+#endif
}
static void enable_iommus_v2(void)
@@ -1905,6 +2049,11 @@ static void disable_iommus(void)
for_each_iommu(iommu)
iommu_disable(iommu);
+
+#ifdef CONFIG_IRQ_REMAP
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ amd_iommu_irq_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+#endif
}
/*
@@ -2059,7 +2208,7 @@ static int __init early_amd_iommu_init(void)
struct acpi_table_header *ivrs_base;
acpi_size ivrs_size;
acpi_status status;
- int i, ret = 0;
+ int i, remap_cache_sz, ret = 0;
if (!amd_iommu_detected)
return -ENODEV;
@@ -2157,10 +2306,14 @@ static int __init early_amd_iommu_init(void)
* remapping tables.
*/
ret = -ENOMEM;
+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32);
+ else
+ remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2);
amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache",
- MAX_IRQS_PER_TABLE * sizeof(u32),
- IRQ_TABLE_ALIGNMENT,
- 0, NULL);
+ remap_cache_sz,
+ IRQ_TABLE_ALIGNMENT,
+ 0, NULL);
if (!amd_iommu_irq_cache)
goto out;
@@ -2413,6 +2566,21 @@ static int __init parse_amd_iommu_dump(char *str)
return 1;
}
+static int __init parse_amd_iommu_intr(char *str)
+{
+ for (; *str; ++str) {
+ if (strncmp(str, "legacy", 6) == 0) {
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
+ break;
+ }
+ if (strncmp(str, "vapic", 5) == 0) {
+ amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+ break;
+ }
+ }
+ return 1;
+}
+
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
@@ -2521,6 +2689,7 @@ static int __init parse_ivrs_acpihid(char *str)
__setup("amd_iommu_dump", parse_amd_iommu_dump);
__setup("amd_iommu=", parse_amd_iommu_options);
+__setup("amd_iommu_intr=", parse_amd_iommu_intr);
__setup("ivrs_ioapic", parse_ivrs_ioapic);
__setup("ivrs_hpet", parse_ivrs_hpet);
__setup("ivrs_acpihid", parse_ivrs_acpihid);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 0bd9eb374462..faa3b4895cf0 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -38,6 +38,7 @@ extern int amd_iommu_enable(void);
extern void amd_iommu_disable(void);
extern int amd_iommu_reenable(int);
extern int amd_iommu_enable_faulting(void);
+extern int amd_iommu_guest_ir;
/* IOMMUv2 specific functions */
struct iommu_domain;
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 9652848e3155..0d91785ebdc3 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -22,6 +22,7 @@
#include <linux/types.h>
#include <linux/mutex.h>
+#include <linux/msi.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
@@ -69,6 +70,8 @@
#define MMIO_EXCL_LIMIT_OFFSET 0x0028
#define MMIO_EXT_FEATURES 0x0030
#define MMIO_PPR_LOG_OFFSET 0x0038
+#define MMIO_GA_LOG_BASE_OFFSET 0x00e0
+#define MMIO_GA_LOG_TAIL_OFFSET 0x00e8
#define MMIO_CMD_HEAD_OFFSET 0x2000
#define MMIO_CMD_TAIL_OFFSET 0x2008
#define MMIO_EVT_HEAD_OFFSET 0x2010
@@ -76,6 +79,8 @@
#define MMIO_STATUS_OFFSET 0x2020
#define MMIO_PPR_HEAD_OFFSET 0x2030
#define MMIO_PPR_TAIL_OFFSET 0x2038
+#define MMIO_GA_HEAD_OFFSET 0x2040
+#define MMIO_GA_TAIL_OFFSET 0x2048
#define MMIO_CNTR_CONF_OFFSET 0x4000
#define MMIO_CNTR_REG_OFFSET 0x40000
#define MMIO_REG_END_OFFSET 0x80000
@@ -92,6 +97,7 @@
#define FEATURE_GA (1ULL<<7)
#define FEATURE_HE (1ULL<<8)
#define FEATURE_PC (1ULL<<9)
+#define FEATURE_GAM_VAPIC (1ULL<<21)
#define FEATURE_PASID_SHIFT 32
#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT)
@@ -110,6 +116,9 @@
#define MMIO_STATUS_EVT_INT_MASK (1 << 1)
#define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2)
#define MMIO_STATUS_PPR_INT_MASK (1 << 6)
+#define MMIO_STATUS_GALOG_RUN_MASK (1 << 8)
+#define MMIO_STATUS_GALOG_OVERFLOW_MASK (1 << 9)
+#define MMIO_STATUS_GALOG_INT_MASK (1 << 10)
/* event logging constants */
#define EVENT_ENTRY_SIZE 0x10
@@ -146,6 +155,10 @@
#define CONTROL_PPFINT_EN 0x0eULL
#define CONTROL_PPR_EN 0x0fULL
#define CONTROL_GT_EN 0x10ULL
+#define CONTROL_GA_EN 0x11ULL
+#define CONTROL_GAM_EN 0x19ULL
+#define CONTROL_GALOG_EN 0x1CULL
+#define CONTROL_GAINT_EN 0x1DULL
#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT)
#define CTRL_INV_TO_NONE 0
@@ -224,6 +237,19 @@
#define PPR_REQ_FAULT 0x01
+/* Constants for GA Log handling */
+#define GA_LOG_ENTRIES 512
+#define GA_LOG_SIZE_SHIFT 56
+#define GA_LOG_SIZE_512 (0x8ULL << GA_LOG_SIZE_SHIFT)
+#define GA_ENTRY_SIZE 8
+#define GA_LOG_SIZE (GA_ENTRY_SIZE * GA_LOG_ENTRIES)
+
+#define GA_TAG(x) (u32)(x & 0xffffffffULL)
+#define GA_DEVID(x) (u16)(((x) >> 32) & 0xffffULL)
+#define GA_REQ_TYPE(x) (((x) >> 60) & 0xfULL)
+
+#define GA_GUEST_NR 0x1
+
#define PAGE_MODE_NONE 0x00
#define PAGE_MODE_1_LEVEL 0x01
#define PAGE_MODE_2_LEVEL 0x02
@@ -329,6 +355,12 @@
#define IOMMU_CAP_NPCACHE 26
#define IOMMU_CAP_EFR 27
+/* IOMMU Feature Reporting Field (for IVHD type 10h */
+#define IOMMU_FEAT_GASUP_SHIFT 6
+
+/* IOMMU Extended Feature Register (EFR) */
+#define IOMMU_EFR_GASUP_SHIFT 7
+
#define MAX_DOMAIN_ID 65536
/* Protection domain flags */
@@ -400,6 +432,7 @@ struct amd_iommu_fault {
struct iommu_domain;
struct irq_domain;
+struct amd_irte_ops;
/*
* This structure contains generic data for IOMMU protection domains
@@ -490,6 +523,12 @@ struct amd_iommu {
/* Base of the PPR log, if present */
u8 *ppr_log;
+ /* Base of the GA log, if present */
+ u8 *ga_log;
+
+ /* Tail of the GA log, if present */
+ u8 *ga_log_tail;
+
/* true if interrupts for this IOMMU are already enabled */
bool int_enabled;
@@ -523,6 +562,8 @@ struct amd_iommu {
#ifdef CONFIG_IRQ_REMAP
struct irq_domain *ir_domain;
struct irq_domain *msi_domain;
+
+ struct amd_irte_ops *irte_ops;
#endif
volatile u64 __aligned(8) cmd_sem;
@@ -683,4 +724,112 @@ static inline int get_hpet_devid(int id)
return -EINVAL;
}
+enum amd_iommu_intr_mode_type {
+ AMD_IOMMU_GUEST_IR_LEGACY,
+
+ /* This mode is not visible to users. It is used when
+ * we cannot fully enable vAPIC and fallback to only support
+ * legacy interrupt remapping via 128-bit IRTE.
+ */
+ AMD_IOMMU_GUEST_IR_LEGACY_GA,
+ AMD_IOMMU_GUEST_IR_VAPIC,
+};
+
+#define AMD_IOMMU_GUEST_IR_GA(x) (x == AMD_IOMMU_GUEST_IR_VAPIC || \
+ x == AMD_IOMMU_GUEST_IR_LEGACY_GA)
+
+#define AMD_IOMMU_GUEST_IR_VAPIC(x) (x == AMD_IOMMU_GUEST_IR_VAPIC)
+
+union irte {
+ u32 val;
+ struct {
+ u32 valid : 1,
+ no_fault : 1,
+ int_type : 3,
+ rq_eoi : 1,
+ dm : 1,
+ rsvd_1 : 1,
+ destination : 8,
+ vector : 8,
+ rsvd_2 : 8;
+ } fields;
+};
+
+union irte_ga_lo {
+ u64 val;
+
+ /* For int remapping */
+ struct {
+ u64 valid : 1,
+ no_fault : 1,
+ /* ------ */
+ int_type : 3,
+ rq_eoi : 1,
+ dm : 1,
+ /* ------ */
+ guest_mode : 1,
+ destination : 8,
+ rsvd : 48;
+ } fields_remap;
+
+ /* For guest vAPIC */
+ struct {
+ u64 valid : 1,
+ no_fault : 1,
+ /* ------ */
+ ga_log_intr : 1,
+ rsvd1 : 3,
+ is_run : 1,
+ /* ------ */
+ guest_mode : 1,
+ destination : 8,
+ rsvd2 : 16,
+ ga_tag : 32;
+ } fields_vapic;
+};
+
+union irte_ga_hi {
+ u64 val;
+ struct {
+ u64 vector : 8,
+ rsvd_1 : 4,
+ ga_root_ptr : 40,
+ rsvd_2 : 12;
+ } fields;
+};
+
+struct irte_ga {
+ union irte_ga_lo lo;
+ union irte_ga_hi hi;
+};
+
+struct irq_2_irte {
+ u16 devid; /* Device ID for IRTE table */
+ u16 index; /* Index into IRTE table*/
+};
+
+struct amd_ir_data {
+ u32 cached_ga_tag;
+ struct irq_2_irte irq_2_irte;
+ struct msi_msg msi_entry;
+ void *entry; /* Pointer to union irte or struct irte_ga */
+ void *ref; /* Pointer to the actual irte */
+};
+
+struct amd_irte_ops {
+ void (*prepare)(void *, u32, u32, u8, u32, int);
+ void (*activate)(void *, u16, u16);
+ void (*deactivate)(void *, u16, u16);
+ void (*set_affinity)(void *, u16, u16, u8, u32);
+ void *(*get)(struct irq_remap_table *, int);
+ void (*set_allocated)(struct irq_remap_table *, int);
+ bool (*is_allocated)(struct irq_remap_table *, int);
+ void (*clear_allocated)(struct irq_remap_table *, int);
+};
+
+#ifdef CONFIG_IRQ_REMAP
+extern struct amd_irte_ops irte_32_ops;
+extern struct amd_irte_ops irte_128_ops;
+#endif
+
#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 19b698ef3336..002f0922cd92 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -20,9 +20,11 @@
#include <linux/kvm.h>
#include <linux/irqreturn.h>
#include <linux/spinlock.h>
+#include <linux/static_key.h>
#include <linux/types.h>
#include <kvm/iodev.h>
#include <linux/list.h>
+#include <linux/jump_label.h>
#define VGIC_V3_MAX_CPUS 255
#define VGIC_V2_MAX_CPUS 8
@@ -49,6 +51,9 @@ struct vgic_global {
/* Physical address of vgic virtual cpu interface */
phys_addr_t vcpu_base;
+ /* GICV mapping */
+ void __iomem *vcpu_base_va;
+
/* virtual control interface mapping */
void __iomem *vctrl_base;
@@ -63,6 +68,9 @@ struct vgic_global {
/* Only needed for the legacy KVM_CREATE_IRQCHIP */
bool can_emulate_gicv2;
+
+ /* GIC system register CPU interface */
+ struct static_key_false gicv3_cpuif;
};
extern struct vgic_global kvm_vgic_global_state;
@@ -217,7 +225,6 @@ struct vgic_v2_cpu_if {
};
struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
u32 vgic_hcr;
u32 vgic_vmcr;
u32 vgic_sre; /* Restored only, change ignored */
@@ -227,7 +234,6 @@ struct vgic_v3_cpu_if {
u32 vgic_ap0r[4];
u32 vgic_ap1r[4];
u64 vgic_lr[VGIC_V3_MAX_LRS];
-#endif
};
struct vgic_cpu {
@@ -265,6 +271,8 @@ struct vgic_cpu {
bool lpis_enabled;
};
+extern struct static_key_false vgic_v2_cpuif_trap;
+
int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
void kvm_vgic_early_init(struct kvm *kvm);
int kvm_vgic_create(struct kvm *kvm, u32 type);
@@ -294,13 +302,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-#ifdef CONFIG_KVM_ARM_VGIC_V3
void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
/**
* kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 2b08e79f5100..09751d349963 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -22,6 +22,20 @@
#include <linux/types.h>
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupt
+ */
+struct amd_iommu_pi_data {
+ u32 ga_tag;
+ u32 prev_ga_tag;
+ u64 base;
+ bool is_guest_mode;
+ struct vcpu_data *vcpu_data;
+ void *ir_data;
+};
+
#ifdef CONFIG_AMD_IOMMU
struct task_struct;
@@ -168,11 +182,34 @@ typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid);
extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
amd_iommu_invalidate_ctx cb);
-
-#else
+#else /* CONFIG_AMD_IOMMU */
static inline int amd_iommu_detect(void) { return -ENODEV; }
-#endif
+#endif /* CONFIG_AMD_IOMMU */
+
+#if defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP)
+
+/* IOMMU AVIC Function */
+extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
+
+extern int
+amd_iommu_update_ga(int cpu, bool is_run, void *data);
+
+#else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
+
+static inline int
+amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+ return 0;
+}
+
+static inline int
+amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+ return 0;
+}
+
+#endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9c28b4d4c90b..01c0b9cc3915 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -265,6 +265,7 @@ struct kvm_vcpu {
#endif
bool preempted;
struct kvm_vcpu_arch arch;
+ struct dentry *debugfs_dentry;
};
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -749,6 +750,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+bool kvm_arch_has_vcpu_debugfs(void);
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
+
int kvm_arch_hardware_enable(void);
void kvm_arch_hardware_disable(void);
int kvm_arch_hardware_setup(void);
diff --git a/arch/x86/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 9906505c998a..8d9643767142 100644
--- a/arch/x86/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -29,3 +29,4 @@ CONFIG_NET_9P_VIRTIO=y
CONFIG_SCSI_LOWLEVEL=y
CONFIG_SCSI_VIRTIO=y
CONFIG_VIRTIO_INPUT=y
+CONFIG_DRM_VIRTIO_GPU=y
diff --git a/arch/arm64/kvm/emulate.c b/virt/kvm/arm/aarch32.c
index f87d8fbaa48d..528af4b2d09e 100644
--- a/arch/arm64/kvm/emulate.c
+++ b/virt/kvm/arm/aarch32.c
@@ -22,8 +22,13 @@
*/
#include <linux/kvm_host.h>
-#include <asm/esr.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#ifndef CONFIG_ARM64
+#define COMPAT_PSR_T_BIT PSR_T_BIT
+#define COMPAT_PSR_IT_MASK PSR_IT_MASK
+#endif
/*
* stolen from arch/arm/kernel/opcodes.c
@@ -52,16 +57,6 @@ static const unsigned short cc_map[16] = {
0 /* NV */
};
-static int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
-{
- u32 esr = kvm_vcpu_get_hsr(vcpu);
-
- if (esr & ESR_ELx_CV)
- return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
-
- return -1;
-}
-
/*
* Check if a trapped instruction should have been executed or not.
*/
@@ -114,15 +109,13 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
*
* IT[7:0] -> CPSR[26:25],CPSR[15:10]
*/
-static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
+static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
{
unsigned long itbits, cond;
unsigned long cpsr = *vcpu_cpsr(vcpu);
bool is_arm = !(cpsr & COMPAT_PSR_T_BIT);
- BUG_ON(is_arm && (cpsr & COMPAT_PSR_IT_MASK));
-
- if (!(cpsr & COMPAT_PSR_IT_MASK))
+ if (is_arm || !(cpsr & COMPAT_PSR_IT_MASK))
return;
cond = (cpsr & 0xe000) >> 13;
@@ -146,7 +139,7 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
* kvm_skip_instr - skip a trapped instruction and proceed to the next
* @vcpu: The vcpu pointer
*/
-void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
+void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
{
bool is_thumb;
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 77e6ccf14901..27a1f6341d41 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -31,7 +31,6 @@
#include "trace.h"
static struct timecounter *timecounter;
-static struct workqueue_struct *wqueue;
static unsigned int host_vtimer_irq;
static u32 host_vtimer_irq_flags;
@@ -141,7 +140,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
- queue_work(wqueue, &timer->expired);
+ schedule_work(&timer->expired);
return HRTIMER_NORESTART;
}
@@ -446,13 +445,7 @@ int kvm_timer_hyp_init(void)
if (err) {
kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
host_vtimer_irq, err);
- goto out;
- }
-
- wqueue = create_singlethread_workqueue("kvm_arch_timer");
- if (!wqueue) {
- err = -ENOMEM;
- goto out_free;
+ return err;
}
kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
@@ -460,10 +453,6 @@ int kvm_timer_hyp_init(void)
cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
"AP_KVM_ARM_TIMER_STARTING", kvm_timer_starting_cpu,
kvm_timer_dying_cpu);
- goto out;
-out_free:
- free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
-out:
return err;
}
@@ -518,7 +507,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
* VCPUs have the enabled variable set, before entering the guest, if
* the arch timers are enabled.
*/
- if (timecounter && wqueue)
+ if (timecounter)
timer->enabled = 1;
return 0;
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
index 7cffd9338c49..c8aeb7b91ec8 100644
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v2-sr.c
@@ -19,6 +19,7 @@
#include <linux/irqchip/arm-gic.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
@@ -167,3 +168,59 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR);
vcpu->arch.vgic_cpu.live_lrs = live_lrs;
}
+
+#ifdef CONFIG_ARM64
+/*
+ * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the
+ * guest.
+ *
+ * @vcpu: the offending vcpu
+ *
+ * Returns:
+ * 1: GICV access successfully performed
+ * 0: Not a GICV access
+ * -1: Illegal GICV access
+ */
+int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+ struct vgic_dist *vgic = &kvm->arch.vgic;
+ phys_addr_t fault_ipa;
+ void __iomem *addr;
+ int rd;
+
+ /* Build the full address */
+ fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+
+ /* If not for GICV, move on */
+ if (fault_ipa < vgic->vgic_cpu_base ||
+ fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE))
+ return 0;
+
+ /* Reject anything but a 32bit access */
+ if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32))
+ return -1;
+
+ /* Not aligned? Don't bother */
+ if (fault_ipa & 3)
+ return -1;
+
+ rd = kvm_vcpu_dabt_get_rd(vcpu);
+ addr = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va);
+ addr += fault_ipa - vgic->vgic_cpu_base;
+
+ if (kvm_vcpu_dabt_iswrite(vcpu)) {
+ u32 data = vcpu_data_guest_to_host(vcpu,
+ vcpu_get_reg(vcpu, rd),
+ sizeof(u32));
+ writel_relaxed(data, addr);
+ } else {
+ u32 data = readl_relaxed(addr);
+ vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data,
+ sizeof(u32)));
+ }
+
+ return 1;
+}
+#endif
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 5f8f80b4a224..3947095cc0a1 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -24,19 +24,6 @@
#define vtr_to_max_lr_idx(v) ((v) & 0xf)
#define vtr_to_nr_pri_bits(v) (((u32)(v) >> 29) + 1)
-#define read_gicreg(r) \
- ({ \
- u64 reg; \
- asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
- reg; \
- })
-
-#define write_gicreg(v,r) \
- do { \
- u64 __val = (v); \
- asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
- } while (0)
-
static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
{
switch (lr & 0xf) {
@@ -335,9 +322,7 @@ void __hyp_text __vgic_v3_init_lrs(void)
__gic_v3_set_lr(0, i);
}
-static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
+u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
{
return read_gicreg(ICH_VTR_EL2);
}
-
-__alias(__vgic_v3_read_ich_vtr_el2) u64 __vgic_v3_get_ich_vtr_el2(void);
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index a027569facfa..6e9c40eea208 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -423,6 +423,14 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
if (!kvm_arm_support_pmu_v3())
return -ENODEV;
+ /*
+ * We currently require an in-kernel VGIC to use the PMU emulation,
+ * because we do not support forwarding PMU overflow interrupts to
+ * userspace yet.
+ */
+ if (!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))
+ return -ENODEV;
+
if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features) ||
!kvm_arm_pmu_irq_initialized(vcpu))
return -ENXIO;
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 83777c1cbae0..8cebfbc19e90 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -405,6 +405,10 @@ int kvm_vgic_hyp_init(void)
break;
case GIC_V3:
ret = vgic_v3_probe(gic_kvm_info);
+ if (!ret) {
+ static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+ kvm_info("GIC system register CPU interface enabled\n");
+ }
break;
default:
ret = -ENODEV;
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
index b31a51a14efb..d918dcf26a5a 100644
--- a/virt/kvm/arm/vgic/vgic-irqfd.c
+++ b/virt/kvm/arm/vgic/vgic-irqfd.c
@@ -46,15 +46,9 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
* @ue: user api routing entry handle
* return 0 on success, -EINVAL on errors.
*/
-#ifdef KVM_CAP_X2APIC_API
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
-#else
-/* Remove this version and the ifdefery once merged into 4.8 */
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
- const struct kvm_irq_routing_entry *ue)
-#endif
{
int r = -EINVAL;
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index 1813f93b5cde..ce1f4ed9daf4 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -71,7 +71,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
addr_ptr = &vgic->vgic_cpu_base;
alignment = SZ_4K;
break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
case KVM_VGIC_V3_ADDR_TYPE_DIST:
type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
addr_ptr = &vgic->vgic_dist_base;
@@ -82,7 +81,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
addr_ptr = &vgic->vgic_redist_base;
alignment = SZ_64K;
break;
-#endif
default:
r = -ENODEV;
goto out;
@@ -219,52 +217,65 @@ int kvm_register_vgic_device(unsigned long type)
ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
KVM_DEV_TYPE_ARM_VGIC_V2);
break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
case KVM_DEV_TYPE_ARM_VGIC_V3:
ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
KVM_DEV_TYPE_ARM_VGIC_V3);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
if (ret)
break;
ret = kvm_vgic_register_its_device();
- break;
#endif
+ break;
}
return ret;
}
-/** vgic_attr_regs_access: allows user space to read/write VGIC registers
- *
- * @dev: kvm device handle
- * @attr: kvm device attribute
- * @reg: address the value is read or written
- * @is_write: write flag
- *
- */
-static int vgic_attr_regs_access(struct kvm_device *dev,
- struct kvm_device_attr *attr,
- u32 *reg, bool is_write)
-{
+struct vgic_reg_attr {
+ struct kvm_vcpu *vcpu;
gpa_t addr;
- int cpuid, ret, c;
- struct kvm_vcpu *vcpu, *tmp_vcpu;
- int vcpu_lock_idx = -1;
+};
+
+static int parse_vgic_v2_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ struct vgic_reg_attr *reg_attr)
+{
+ int cpuid;
cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
KVM_DEV_ARM_VGIC_CPUID_SHIFT;
- vcpu = kvm_get_vcpu(dev->kvm, cpuid);
- addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- mutex_lock(&dev->kvm->lock);
+ if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
+ return -EINVAL;
- ret = vgic_init(dev->kvm);
- if (ret)
- goto out;
+ reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+ reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
- ret = -EINVAL;
- goto out;
+ return 0;
+}
+
+/* unlocks vcpus from @vcpu_lock_idx and smaller */
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+ struct kvm_vcpu *tmp_vcpu;
+
+ for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+ tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+ mutex_unlock(&tmp_vcpu->mutex);
}
+}
+
+static void unlock_all_vcpus(struct kvm *kvm)
+{
+ unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+/* Returns true if all vcpus were locked, false otherwise */
+static bool lock_all_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *tmp_vcpu;
+ int c;
/*
* Any time a vcpu is run, vcpu_load is called which tries to grab the
@@ -272,11 +283,49 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
* that no other VCPUs are run and fiddle with the vgic state while we
* access it.
*/
- ret = -EBUSY;
- kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
- if (!mutex_trylock(&tmp_vcpu->mutex))
- goto out;
- vcpu_lock_idx = c;
+ kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+ if (!mutex_trylock(&tmp_vcpu->mutex)) {
+ unlock_vcpus(kvm, c - 1);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * vgic_attr_regs_access_v2 - allows user space to access VGIC v2 state
+ *
+ * @dev: kvm device handle
+ * @attr: kvm device attribute
+ * @reg: address the value is read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_attr_regs_access_v2(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ u32 *reg, bool is_write)
+{
+ struct vgic_reg_attr reg_attr;
+ gpa_t addr;
+ struct kvm_vcpu *vcpu;
+ int ret;
+
+ ret = parse_vgic_v2_attr(dev, attr, &reg_attr);
+ if (ret)
+ return ret;
+
+ vcpu = reg_attr.vcpu;
+ addr = reg_attr.addr;
+
+ mutex_lock(&dev->kvm->lock);
+
+ ret = vgic_init(dev->kvm);
+ if (ret)
+ goto out;
+
+ if (!lock_all_vcpus(dev->kvm)) {
+ ret = -EBUSY;
+ goto out;
}
switch (attr->group) {
@@ -291,18 +340,12 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
break;
}
+ unlock_all_vcpus(dev->kvm);
out:
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- tmp_vcpu = kvm_get_vcpu(dev->kvm, vcpu_lock_idx);
- mutex_unlock(&tmp_vcpu->mutex);
- }
-
mutex_unlock(&dev->kvm->lock);
return ret;
}
-/* V2 ops */
-
static int vgic_v2_set_attr(struct kvm_device *dev,
struct kvm_device_attr *attr)
{
@@ -321,7 +364,7 @@ static int vgic_v2_set_attr(struct kvm_device *dev,
if (get_user(reg, uaddr))
return -EFAULT;
- return vgic_attr_regs_access(dev, attr, &reg, true);
+ return vgic_attr_regs_access_v2(dev, attr, &reg, true);
}
}
@@ -343,7 +386,7 @@ static int vgic_v2_get_attr(struct kvm_device *dev,
u32 __user *uaddr = (u32 __user *)(long)attr->addr;
u32 reg = 0;
- ret = vgic_attr_regs_access(dev, attr, &reg, false);
+ ret = vgic_attr_regs_access_v2(dev, attr, &reg, false);
if (ret)
return ret;
return put_user(reg, uaddr);
@@ -387,10 +430,6 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = {
.has_attr = vgic_v2_has_attr,
};
-/* V3 ops */
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-
static int vgic_v3_set_attr(struct kvm_device *dev,
struct kvm_device_attr *attr)
{
@@ -433,5 +472,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
.get_attr = vgic_v3_get_attr,
.has_attr = vgic_v3_has_attr,
};
-
-#endif /* CONFIG_KVM_ARM_VGIC_V3 */
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 90d81811fdda..0d3c76a4208b 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -23,7 +23,7 @@
#include "vgic-mmio.h"
/* extract @num bytes at @offset bytes offset in data */
-unsigned long extract_bytes(unsigned long data, unsigned int offset,
+unsigned long extract_bytes(u64 data, unsigned int offset,
unsigned int num)
{
return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
@@ -42,6 +42,7 @@ u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
return reg | ((u64)val << lower);
}
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
bool vgic_has_its(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
@@ -51,6 +52,7 @@ bool vgic_has_its(struct kvm *kvm)
return dist->has_its;
}
+#endif
static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
@@ -179,7 +181,7 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
int target_vcpu_id = vcpu->vcpu_id;
u64 value;
- value = (mpidr & GENMASK(23, 0)) << 32;
+ value = (u64)(mpidr & GENMASK(23, 0)) << 32;
value |= ((target_vcpu_id & 0xffff) << 8);
if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
value |= GICR_TYPER_LAST;
@@ -609,7 +611,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
bool broadcast;
sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
- broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+ broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
mpidr = SGI_AFFINITY_LEVEL(reg, 3);
mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 3bad3c5ed431..e18b30ddcdce 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -550,11 +550,9 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
case VGIC_V2:
len = vgic_v2_init_dist_iodev(io_device);
break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
case VGIC_V3:
len = vgic_v3_init_dist_iodev(io_device);
break;
-#endif
default:
BUG_ON(1);
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 0b3ecf9d100e..4c34d39d44a0 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -96,7 +96,7 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
unsigned long data);
-unsigned long extract_bytes(unsigned long data, unsigned int offset,
+unsigned long extract_bytes(u64 data, unsigned int offset,
unsigned int num);
u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
@@ -162,12 +162,10 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
-#ifdef CONFIG_KVM_ARM_VGIC_V3
u64 vgic_sanitise_outer_cacheability(u64 reg);
u64 vgic_sanitise_inner_cacheability(u64 reg);
u64 vgic_sanitise_shareability(u64 reg);
u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
u64 (*sanitise_fn)(u64));
-#endif
#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 0bf6709d1006..0a063af40565 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -278,12 +278,14 @@ int vgic_v2_map_resources(struct kvm *kvm)
goto out;
}
- ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
- kvm_vgic_global_state.vcpu_base,
- KVM_VGIC_V2_CPU_SIZE, true);
- if (ret) {
- kvm_err("Unable to remap VGIC CPU to VCPU\n");
- goto out;
+ if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
+ ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+ kvm_vgic_global_state.vcpu_base,
+ KVM_VGIC_V2_CPU_SIZE, true);
+ if (ret) {
+ kvm_err("Unable to remap VGIC CPU to VCPU\n");
+ goto out;
+ }
}
dist->ready = true;
@@ -294,6 +296,8 @@ out:
return ret;
}
+DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
+
/**
* vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
* @node: pointer to the DT node
@@ -310,45 +314,51 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
return -ENXIO;
}
- if (!PAGE_ALIGNED(info->vcpu.start)) {
- kvm_err("GICV physical address 0x%llx not page aligned\n",
- (unsigned long long)info->vcpu.start);
- return -ENXIO;
- }
+ if (!PAGE_ALIGNED(info->vcpu.start) ||
+ !PAGE_ALIGNED(resource_size(&info->vcpu))) {
+ kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
+ kvm_vgic_global_state.vcpu_base_va = ioremap(info->vcpu.start,
+ resource_size(&info->vcpu));
+ if (!kvm_vgic_global_state.vcpu_base_va) {
+ kvm_err("Cannot ioremap GICV\n");
+ return -ENOMEM;
+ }
- if (!PAGE_ALIGNED(resource_size(&info->vcpu))) {
- kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
- (unsigned long long)resource_size(&info->vcpu),
- PAGE_SIZE);
- return -ENXIO;
+ ret = create_hyp_io_mappings(kvm_vgic_global_state.vcpu_base_va,
+ kvm_vgic_global_state.vcpu_base_va + resource_size(&info->vcpu),
+ info->vcpu.start);
+ if (ret) {
+ kvm_err("Cannot map GICV into hyp\n");
+ goto out;
+ }
+
+ static_branch_enable(&vgic_v2_cpuif_trap);
}
kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start,
resource_size(&info->vctrl));
if (!kvm_vgic_global_state.vctrl_base) {
kvm_err("Cannot ioremap GICH\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
- ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
- if (ret) {
- kvm_err("Cannot register GICv2 KVM device\n");
- iounmap(kvm_vgic_global_state.vctrl_base);
- return ret;
- }
-
ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
kvm_vgic_global_state.vctrl_base +
resource_size(&info->vctrl),
info->vctrl.start);
if (ret) {
kvm_err("Cannot map VCTRL into hyp\n");
- kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
- iounmap(kvm_vgic_global_state.vctrl_base);
- return ret;
+ goto out;
+ }
+
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ if (ret) {
+ kvm_err("Cannot register GICv2 KVM device\n");
+ goto out;
}
kvm_vgic_global_state.can_emulate_gicv2 = true;
@@ -359,4 +369,11 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
kvm_info("vgic-v2@%llx\n", info->vctrl.start);
return 0;
+out:
+ if (kvm_vgic_global_state.vctrl_base)
+ iounmap(kvm_vgic_global_state.vctrl_base);
+ if (kvm_vgic_global_state.vcpu_base_va)
+ iounmap(kvm_vgic_global_state.vcpu_base_va);
+
+ return ret;
}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index e83b7fe4baae..2893d5ba523a 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -29,7 +29,7 @@
#define DEBUG_SPINLOCK_BUG_ON(p)
#endif
-struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
+struct vgic_global __section(.hyp.text) kvm_vgic_global_state = {.gicv3_cpuif = STATIC_KEY_FALSE_INIT,};
/*
* Locking order is always:
@@ -645,6 +645,9 @@ next:
/* Sync back the hardware VGIC state into our emulation after a guest's run. */
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
+ if (unlikely(!vgic_initialized(vcpu->kvm)))
+ return;
+
vgic_process_maintenance_interrupt(vcpu);
vgic_fold_lr_state(vcpu);
vgic_prune_ap_list(vcpu);
@@ -653,6 +656,9 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
/* Flush our emulation state into the GIC hardware before entering the guest. */
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
+ if (unlikely(!vgic_initialized(vcpu->kvm)))
+ return;
+
spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
vgic_flush_lr_state(vcpu);
spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 6c4625c46368..9d9e014765a2 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -72,7 +72,6 @@ static inline void vgic_get_irq_kref(struct vgic_irq *irq)
kref_get(&irq->refcount);
}
-#ifdef CONFIG_KVM_ARM_VGIC_V3
void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
@@ -84,63 +83,14 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
int vgic_v3_probe(const struct gic_kvm_info *info);
int vgic_v3_map_resources(struct kvm *kvm);
int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
int vgic_register_its_iodevs(struct kvm *kvm);
bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
void vgic_enable_lpis(struct kvm_vcpu *vcpu);
int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
#else
-static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu,
- struct vgic_irq *irq, int lr)
-{
-}
-
-static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-}
-
-static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-}
-
-static inline
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-}
-
-static inline void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline int vgic_v3_probe(const struct gic_kvm_info *info)
-{
- return -ENODEV;
-}
-
-static inline int vgic_v3_map_resources(struct kvm *kvm)
-{
- return -ENODEV;
-}
-
-static inline int vgic_register_redist_iodevs(struct kvm *kvm,
- gpa_t dist_base_address)
-{
- return -ENODEV;
-}
-
static inline int vgic_register_its_iodevs(struct kvm *kvm)
{
return -ENODEV;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index e469b6012471..f397e9b20370 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -42,7 +42,6 @@
#ifdef CONFIG_HAVE_KVM_IRQFD
-static struct workqueue_struct *irqfd_cleanup_wq;
static void
irqfd_inject(struct work_struct *work)
@@ -168,7 +167,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
list_del_init(&irqfd->list);
- queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
+ schedule_work(&irqfd->shutdown);
}
int __attribute__((weak)) kvm_arch_set_irq_inatomic(
@@ -555,7 +554,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
* so that we guarantee there will not be any more interrupts on this
* gsi once this deassign function returns.
*/
- flush_workqueue(irqfd_cleanup_wq);
+ flush_work(&irqfd->shutdown);
return 0;
}
@@ -592,7 +591,7 @@ kvm_irqfd_release(struct kvm *kvm)
* Block until we know all outstanding shutdown jobs have completed
* since we do not take a kvm* reference.
*/
- flush_workqueue(irqfd_cleanup_wq);
+ flush_work(&irqfd->shutdown);
}
@@ -622,23 +621,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
spin_unlock_irq(&kvm->irqfds.lock);
}
-/*
- * create a host-wide workqueue for issuing deferred shutdown requests
- * aggregated from all vm* instances. We need our own isolated single-thread
- * queue to prevent deadlock against flushing the normal work-queue.
- */
-int kvm_irqfd_init(void)
-{
- irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
- if (!irqfd_cleanup_wq)
- return -ENOMEM;
-
- return 0;
-}
-
void kvm_irqfd_exit(void)
{
- destroy_workqueue(irqfd_cleanup_wq);
}
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 195078225aa5..81dfc73d3df3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -559,9 +559,11 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm)
debugfs_remove_recursive(kvm->debugfs_dentry);
- for (i = 0; i < kvm_debugfs_num_entries; i++)
- kfree(kvm->debugfs_stat_data[i]);
- kfree(kvm->debugfs_stat_data);
+ if (kvm->debugfs_stat_data) {
+ for (i = 0; i < kvm_debugfs_num_entries; i++)
+ kfree(kvm->debugfs_stat_data[i]);
+ kfree(kvm->debugfs_stat_data);
+ }
}
static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
@@ -2369,6 +2371,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
struct kvm_vcpu *vcpu = filp->private_data;
+ debugfs_remove_recursive(vcpu->debugfs_dentry);
kvm_put_kvm(vcpu->kvm);
return 0;
}
@@ -2391,6 +2394,32 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
}
+static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ char dir_name[ITOA_MAX_LEN * 2];
+ int ret;
+
+ if (!kvm_arch_has_vcpu_debugfs())
+ return 0;
+
+ if (!debugfs_initialized())
+ return 0;
+
+ snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
+ vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
+ vcpu->kvm->debugfs_dentry);
+ if (!vcpu->debugfs_dentry)
+ return -ENOMEM;
+
+ ret = kvm_arch_create_vcpu_debugfs(vcpu);
+ if (ret < 0) {
+ debugfs_remove_recursive(vcpu->debugfs_dentry);
+ return ret;
+ }
+
+ return 0;
+}
+
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
@@ -2423,6 +2452,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
if (r)
goto vcpu_destroy;
+ r = kvm_create_vcpu_debugfs(vcpu);
+ if (r)
+ goto vcpu_destroy;
+
mutex_lock(&kvm->lock);
if (kvm_get_vcpu_by_id(kvm, id)) {
r = -EEXIST;
@@ -2454,6 +2487,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
+ debugfs_remove_recursive(vcpu->debugfs_dentry);
vcpu_destroy:
kvm_arch_vcpu_destroy(vcpu);
vcpu_decrement:
@@ -3619,7 +3653,7 @@ static int vm_stat_get_per_vm(void *data, u64 *val)
{
struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
- *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+ *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
return 0;
}
@@ -3649,7 +3683,7 @@ static int vcpu_stat_get_per_vm(void *data, u64 *val)
*val = 0;
kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
- *val += *(u32 *)((void *)vcpu + stat_data->offset);
+ *val += *(u64 *)((void *)vcpu + stat_data->offset);
return 0;
}
@@ -3807,12 +3841,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
* kvm_arch_init makes sure there's at most one caller
* for architectures that support multiple implementations,
* like intel and amd on x86.
- * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
- * conflicts in case kvm is already setup for another implementation.
*/
- r = kvm_irqfd_init();
- if (r)
- goto out_irqfd;
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
@@ -3894,7 +3923,6 @@ out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
kvm_irqfd_exit();
-out_irqfd:
kvm_arch_exit();
out_fail:
return r;